diff options
458 files changed, 12335 insertions, 5089 deletions
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl index a0d479d1e1dd..f66f4df18690 100644 --- a/Documentation/DocBook/kernel-locking.tmpl +++ b/Documentation/DocBook/kernel-locking.tmpl @@ -1645,7 +1645,9 @@ the amount of locking which needs to be done. all the readers who were traversing the list when we deleted the element are finished. We use <function>call_rcu()</function> to register a callback which will actually destroy the object once - the readers are finished. + all pre-existing readers are finished. Alternatively, + <function>synchronize_rcu()</function> may be used to block until + all pre-existing are finished. </para> <para> But how does Read Copy Update know when the readers are @@ -1714,7 +1716,7 @@ the amount of locking which needs to be done. - object_put(obj); + list_del_rcu(&obj->list); cache_num--; -+ call_rcu(&obj->rcu, cache_delete_rcu, obj); ++ call_rcu(&obj->rcu, cache_delete_rcu); } /* Must be holding cache_lock */ @@ -1725,14 +1727,6 @@ the amount of locking which needs to be done. if (++cache_num > MAX_CACHE_SIZE) { struct object *i, *outcast = NULL; list_for_each_entry(i, &cache, list) { -@@ -85,6 +94,7 @@ - obj->popularity = 0; - atomic_set(&obj->refcnt, 1); /* The cache holds a reference */ - spin_lock_init(&obj->lock); -+ INIT_RCU_HEAD(&obj->rcu); - - spin_lock_irqsave(&cache_lock, flags); - __cache_add(obj); @@ -104,12 +114,11 @@ struct object *cache_find(int id) { diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 790d1a812376..0c134f8afc6f 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -218,13 +218,22 @@ over a rather long period of time, but improvements are always welcome! include: a. Keeping a count of the number of data-structure elements - used by the RCU-protected data structure, including those - waiting for a grace period to elapse. Enforce a limit - on this number, stalling updates as needed to allow - previously deferred frees to complete. - - Alternatively, limit only the number awaiting deferred - free rather than the total number of elements. + used by the RCU-protected data structure, including + those waiting for a grace period to elapse. Enforce a + limit on this number, stalling updates as needed to allow + previously deferred frees to complete. Alternatively, + limit only the number awaiting deferred free rather than + the total number of elements. + + One way to stall the updates is to acquire the update-side + mutex. (Don't try this with a spinlock -- other CPUs + spinning on the lock could prevent the grace period + from ever ending.) Another way to stall the updates + is for the updates to use a wrapper function around + the memory allocator, so that this wrapper function + simulates OOM when there is too much memory awaiting an + RCU grace period. There are of course many other + variations on this theme. b. Limiting update rate. For example, if updates occur only once per hour, then no explicit rate limiting is required, @@ -365,3 +374,26 @@ over a rather long period of time, but improvements are always welcome! and the compiler to freely reorder code into and out of RCU read-side critical sections. It is the responsibility of the RCU update-side primitives to deal with this. + +17. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and + the __rcu sparse checks to validate your RCU code. These + can help find problems as follows: + + CONFIG_PROVE_RCU: check that accesses to RCU-protected data + structures are carried out under the proper RCU + read-side critical section, while holding the right + combination of locks, or whatever other conditions + are appropriate. + + CONFIG_DEBUG_OBJECTS_RCU_HEAD: check that you don't pass the + same object to call_rcu() (or friends) before an RCU + grace period has elapsed since the last time that you + passed that same object to call_rcu() (or friends). + + __rcu sparse checks: tag the pointer to the RCU-protected data + structure with __rcu, and sparse will warn you if you + access that pointer without the services of one of the + variants of rcu_dereference(). + + These debugging aids can help you find problems that are + otherwise extremely difficult to spot. diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 44c6dcc93d6d..862c08ef1fde 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -80,6 +80,24 @@ o A CPU looping with bottom halves disabled. This condition can o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel without invoking schedule(). +o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might + happen to preempt a low-priority task in the middle of an RCU + read-side critical section. This is especially damaging if + that low-priority task is not permitted to run on any other CPU, + in which case the next RCU grace period can never complete, which + will eventually cause the system to run out of memory and hang. + While the system is in the process of running itself out of + memory, you might see stall-warning messages. + +o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that + is running at a higher priority than the RCU softirq threads. + This will prevent RCU callbacks from ever being invoked, + and in a CONFIG_TREE_PREEMPT_RCU kernel will further prevent + RCU grace periods from ever completing. Either way, the + system will eventually run out of memory and hang. In the + CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning + messages. + o A bug in the RCU implementation. o A hardware failure. This is quite unlikely, but has occurred diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index efd8cc95c06b..a851118775d8 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -125,6 +125,17 @@ o "b" is the batch limit for this CPU. If more than this number of RCU callbacks is ready to invoke, then the remainder will be deferred. +o "ci" is the number of RCU callbacks that have been invoked for + this CPU. Note that ci+ql is the number of callbacks that have + been registered in absence of CPU-hotplug activity. + +o "co" is the number of RCU callbacks that have been orphaned due to + this CPU going offline. + +o "ca" is the number of RCU callbacks that have been adopted due to + other CPUs going offline. Note that ci+co-ca+ql is the number of + RCU callbacks registered on this CPU. + There is also an rcu/rcudata.csv file with the same information in comma-separated-variable spreadsheet format. @@ -180,7 +191,7 @@ o "s" is the "signaled" state that drives force_quiescent_state()'s o "jfq" is the number of jiffies remaining for this grace period before force_quiescent_state() is invoked to help push things - along. Note that CPUs in dyntick-idle mode thoughout the grace + along. Note that CPUs in dyntick-idle mode throughout the grace period will not report on their own, but rather must be check by some other CPU via force_quiescent_state(). diff --git a/Documentation/networking/e1000.txt b/Documentation/networking/e1000.txt index 2df71861e578..d9271e74e488 100644 --- a/Documentation/networking/e1000.txt +++ b/Documentation/networking/e1000.txt @@ -1,82 +1,35 @@ Linux* Base Driver for the Intel(R) PRO/1000 Family of Adapters =============================================================== -September 26, 2006 - +Intel Gigabit Linux driver. +Copyright(c) 1999 - 2010 Intel Corporation. Contents ======== -- In This Release - Identifying Your Adapter -- Building and Installation - Command Line Parameters - Speed and Duplex Configuration - Additional Configurations -- Known Issues - Support - -In This Release -=============== - -This file describes the Linux* Base Driver for the Intel(R) PRO/1000 Family -of Adapters. This driver includes support for Itanium(R)2-based systems. - -For questions related to hardware requirements, refer to the documentation -supplied with your Intel PRO/1000 adapter. All hardware requirements listed -apply to use with Linux. - -The following features are now available in supported kernels: - - Native VLANs - - Channel Bonding (teaming) - - SNMP - -Channel Bonding documentation can be found in the Linux kernel source: -/Documentation/networking/bonding.txt - -The driver information previously displayed in the /proc filesystem is not -supported in this release. Alternatively, you can use ethtool (version 1.6 -or later), lspci, and ifconfig to obtain the same information. - -Instructions on updating ethtool can be found in the section "Additional -Configurations" later in this document. - -NOTE: The Intel(R) 82562v 10/100 Network Connection only provides 10/100 -support. - - Identifying Your Adapter ======================== For more information on how to identify your adapter, go to the Adapter & Driver ID Guide at: - http://support.intel.com/support/network/adapter/pro100/21397.htm + http://support.intel.com/support/go/network/adapter/idguide.htm For the latest Intel network drivers for Linux, refer to the following website. In the search field, enter your adapter name or type, or use the networking link on the left to search for your adapter: - http://downloadfinder.intel.com/scripts-df/support_intel.asp - + http://support.intel.com/support/go/network/adapter/home.htm Command Line Parameters ======================= -If the driver is built as a module, the following optional parameters -are used by entering them on the command line with the modprobe command -using this syntax: - - modprobe e1000 [<option>=<VAL1>,<VAL2>,...] - -For example, with two PRO/1000 PCI adapters, entering: - - modprobe e1000 TxDescriptors=80,128 - -loads the e1000 driver with 80 TX descriptors for the first adapter and -128 TX descriptors for the second adapter. - The default value for each parameter is generally the recommended setting, unless otherwise noted. @@ -89,10 +42,6 @@ NOTES: For more information about the AutoNeg, Duplex, and Speed parameters, see the application note at: http://www.intel.com/design/network/applnots/ap450.htm - A descriptor describes a data buffer and attributes related to - the data buffer. This information is accessed by the hardware. - - AutoNeg ------- (Supported only on adapters with copper connections) @@ -106,7 +55,6 @@ Duplex parameters must not be specified. NOTE: Refer to the Speed and Duplex section of this readme for more information on the AutoNeg parameter. - Duplex ------ (Supported only on adapters with copper connections) @@ -119,7 +67,6 @@ set to auto-negotiate, the board auto-detects the correct duplex. If the link partner is forced (either full or half), Duplex defaults to half- duplex. - FlowControl ----------- Valid Range: 0-3 (0=none, 1=Rx only, 2=Tx only, 3=Rx&Tx) @@ -128,16 +75,16 @@ Default Value: Reads flow control settings from the EEPROM This parameter controls the automatic generation(Tx) and response(Rx) to Ethernet PAUSE frames. - InterruptThrottleRate --------------------- (not supported on Intel(R) 82542, 82543 or 82544-based adapters) -Valid Range: 0,1,3,100-100000 (0=off, 1=dynamic, 3=dynamic conservative) +Valid Range: 0,1,3,4,100-100000 (0=off, 1=dynamic, 3=dynamic conservative, + 4=simplified balancing) Default Value: 3 The driver can limit the amount of interrupts per second that the adapter -will generate for incoming packets. It does this by writing a value to the -adapter that is based on the maximum amount of interrupts that the adapter +will generate for incoming packets. It does this by writing a value to the +adapter that is based on the maximum amount of interrupts that the adapter will generate per second. Setting InterruptThrottleRate to a value greater or equal to 100 @@ -146,37 +93,43 @@ per second, even if more packets have come in. This reduces interrupt load on the system and can lower CPU utilization under heavy load, but will increase latency as packets are not processed as quickly. -The default behaviour of the driver previously assumed a static -InterruptThrottleRate value of 8000, providing a good fallback value for -all traffic types,but lacking in small packet performance and latency. -The hardware can handle many more small packets per second however, and +The default behaviour of the driver previously assumed a static +InterruptThrottleRate value of 8000, providing a good fallback value for +all traffic types,but lacking in small packet performance and latency. +The hardware can handle many more small packets per second however, and for this reason an adaptive interrupt moderation algorithm was implemented. Since 7.3.x, the driver has two adaptive modes (setting 1 or 3) in which -it dynamically adjusts the InterruptThrottleRate value based on the traffic +it dynamically adjusts the InterruptThrottleRate value based on the traffic that it receives. After determining the type of incoming traffic in the last -timeframe, it will adjust the InterruptThrottleRate to an appropriate value +timeframe, it will adjust the InterruptThrottleRate to an appropriate value for that traffic. The algorithm classifies the incoming traffic every interval into -classes. Once the class is determined, the InterruptThrottleRate value is -adjusted to suit that traffic type the best. There are three classes defined: +classes. Once the class is determined, the InterruptThrottleRate value is +adjusted to suit that traffic type the best. There are three classes defined: "Bulk traffic", for large amounts of packets of normal size; "Low latency", for small amounts of traffic and/or a significant percentage of small -packets; and "Lowest latency", for almost completely small packets or +packets; and "Lowest latency", for almost completely small packets or minimal traffic. -In dynamic conservative mode, the InterruptThrottleRate value is set to 4000 -for traffic that falls in class "Bulk traffic". If traffic falls in the "Low -latency" or "Lowest latency" class, the InterruptThrottleRate is increased +In dynamic conservative mode, the InterruptThrottleRate value is set to 4000 +for traffic that falls in class "Bulk traffic". If traffic falls in the "Low +latency" or "Lowest latency" class, the InterruptThrottleRate is increased stepwise to 20000. This default mode is suitable for most applications. For situations where low latency is vital such as cluster or grid computing, the algorithm can reduce latency even more when InterruptThrottleRate is set to mode 1. In this mode, which operates -the same as mode 3, the InterruptThrottleRate will be increased stepwise to +the same as mode 3, the InterruptThrottleRate will be increased stepwise to 70000 for traffic in class "Lowest latency". +In simplified mode the interrupt rate is based on the ratio of Tx and +Rx traffic. If the bytes per second rate is approximately equal, the +interrupt rate will drop as low as 2000 interrupts per second. If the +traffic is mostly transmit or mostly receive, the interrupt rate could +be as high as 8000. + Setting InterruptThrottleRate to 0 turns off any interrupt moderation and may improve small packet latency, but is generally not suitable for bulk throughput traffic. @@ -212,8 +165,6 @@ NOTE: When e1000 is loaded with default settings and multiple adapters be platform-specific. If CPU utilization is not a concern, use RX_POLLING (NAPI) and default driver settings. - - RxDescriptors ------------- Valid Range: 80-256 for 82542 and 82543-based adapters @@ -225,15 +176,14 @@ by the driver. Increasing this value allows the driver to buffer more incoming packets, at the expense of increased system memory utilization. Each descriptor is 16 bytes. A receive buffer is also allocated for each -descriptor and can be either 2048, 4096, 8192, or 16384 bytes, depending +descriptor and can be either 2048, 4096, 8192, or 16384 bytes, depending on the MTU setting. The maximum MTU size is 16110. -NOTE: MTU designates the frame size. It only needs to be set for Jumbo - Frames. Depending on the available system resources, the request - for a higher number of receive descriptors may be denied. In this +NOTE: MTU designates the frame size. It only needs to be set for Jumbo + Frames. Depending on the available system resources, the request + for a higher number of receive descriptors may be denied. In this case, use a lower number. - RxIntDelay ---------- Valid Range: 0-65535 (0=off) @@ -254,7 +204,6 @@ CAUTION: When setting RxIntDelay to a value other than 0, adapters may restoring the network connection. To eliminate the potential for the hang ensure that RxIntDelay is set to 0. - RxAbsIntDelay ------------- (This parameter is supported only on 82540, 82545 and later adapters.) @@ -268,7 +217,6 @@ packet is received within the set amount of time. Proper tuning, along with RxIntDelay, may improve traffic throughput in specific network conditions. - Speed ----- (This parameter is supported only on adapters with copper connections.) @@ -280,7 +228,6 @@ Speed forces the line speed to the specified value in megabits per second partner is set to auto-negotiate, the board will auto-detect the correct speed. Duplex should also be set when Speed is set to either 10 or 100. - TxDescriptors ------------- Valid Range: 80-256 for 82542 and 82543-based adapters @@ -295,6 +242,36 @@ NOTE: Depending on the available system resources, the request for a higher number of transmit descriptors may be denied. In this case, use a lower number. +TxDescriptorStep +---------------- +Valid Range: 1 (use every Tx Descriptor) + 4 (use every 4th Tx Descriptor) + +Default Value: 1 (use every Tx Descriptor) + +On certain non-Intel architectures, it has been observed that intense TX +traffic bursts of short packets may result in an improper descriptor +writeback. If this occurs, the driver will report a "TX Timeout" and reset +the adapter, after which the transmit flow will restart, though data may +have stalled for as much as 10 seconds before it resumes. + +The improper writeback does not occur on the first descriptor in a system +memory cache-line, which is typically 32 bytes, or 4 descriptors long. + +Setting TxDescriptorStep to a value of 4 will ensure that all TX descriptors +are aligned to the start of a system memory cache line, and so this problem +will not occur. + +NOTES: Setting TxDescriptorStep to 4 effectively reduces the number of + TxDescriptors available for transmits to 1/4 of the normal allocation. + This has a possible negative performance impact, which may be + compensated for by allocating more descriptors using the TxDescriptors + module parameter. + + There are other conditions which may result in "TX Timeout", which will + not be resolved by the use of the TxDescriptorStep parameter. As the + issue addressed by this parameter has never been observed on Intel + Architecture platforms, it should not be used on Intel platforms. TxIntDelay ---------- @@ -307,7 +284,6 @@ efficiency if properly tuned for specific network traffic. If the system is reporting dropped transmits, this value may be set too high causing the driver to run out of available transmit descriptors. - TxAbsIntDelay ------------- (This parameter is supported only on 82540, 82545 and later adapters.) @@ -330,6 +306,35 @@ Default Value: 1 A value of '1' indicates that the driver should enable IP checksum offload for received packets (both UDP and TCP) to the adapter hardware. +Copybreak +--------- +Valid Range: 0-xxxxxxx (0=off) +Default Value: 256 +Usage: insmod e1000.ko copybreak=128 + +Driver copies all packets below or equaling this size to a fresh Rx +buffer before handing it up the stack. + +This parameter is different than other parameters, in that it is a +single (not 1,1,1 etc.) parameter applied to all driver instances and +it is also available during runtime at +/sys/module/e1000/parameters/copybreak + +SmartPowerDownEnable +-------------------- +Valid Range: 0-1 +Default Value: 0 (disabled) + +Allows PHY to turn off in lower power states. The user can turn off +this parameter in supported chipsets. + +KumeranLockLoss +--------------- +Valid Range: 0-1 +Default Value: 1 (enabled) + +This workaround skips resetting the PHY at shutdown for the initial +silicon releases of ICH8 systems. Speed and Duplex Configuration ============================== @@ -385,40 +390,9 @@ If the link partner is forced to a specific speed and duplex, then this parameter should not be used. Instead, use the Speed and Duplex parameters previously mentioned to force the adapter to the same speed and duplex. - Additional Configurations ========================= - Configuring the Driver on Different Distributions - ------------------------------------------------- - Configuring a network driver to load properly when the system is started - is distribution dependent. Typically, the configuration process involves - adding an alias line to /etc/modules.conf or /etc/modprobe.conf as well - as editing other system startup scripts and/or configuration files. Many - popular Linux distributions ship with tools to make these changes for you. - To learn the proper way to configure a network device for your system, - refer to your distribution documentation. If during this process you are - asked for the driver or module name, the name for the Linux Base Driver - for the Intel(R) PRO/1000 Family of Adapters is e1000. - - As an example, if you install the e1000 driver for two PRO/1000 adapters - (eth0 and eth1) and set the speed and duplex to 10full and 100half, add - the following to modules.conf or or modprobe.conf: - - alias eth0 e1000 - alias eth1 e1000 - options e1000 Speed=10,100 Duplex=2,1 - - Viewing Link Messages - --------------------- - Link messages will not be displayed to the console if the distribution is - restricting system messages. In order to see network driver link messages - on your console, set dmesg to eight by entering the following: - - dmesg -n 8 - - NOTE: This setting is not saved across reboots. - Jumbo Frames ------------ Jumbo Frames support is enabled by changing the MTU to a value larger than @@ -437,9 +411,11 @@ Additional Configurations setting in a different location. Notes: - - - To enable Jumbo Frames, increase the MTU size on the interface beyond - 1500. + Degradation in throughput performance may be observed in some Jumbo frames + environments. If this is observed, increasing the application's socket buffer + size and/or increasing the /proc/sys/net/ipv4/tcp_*mem entry values may help. + See the specific application manual and /usr/src/linux*/Documentation/ + networking/ip-sysctl.txt for more details. - The maximum MTU setting for Jumbo Frames is 16110. This value coincides with the maximum Jumbo Frames size of 16128. @@ -447,40 +423,11 @@ Additional Configurations - Using Jumbo Frames at 10 or 100 Mbps may result in poor performance or loss of link. - - Some Intel gigabit adapters that support Jumbo Frames have a frame size - limit of 9238 bytes, with a corresponding MTU size limit of 9216 bytes. - The adapters with this limitation are based on the Intel(R) 82571EB, - 82572EI, 82573L and 80003ES2LAN controller. These correspond to the - following product names: - Intel(R) PRO/1000 PT Server Adapter - Intel(R) PRO/1000 PT Desktop Adapter - Intel(R) PRO/1000 PT Network Connection - Intel(R) PRO/1000 PT Dual Port Server Adapter - Intel(R) PRO/1000 PT Dual Port Network Connection - Intel(R) PRO/1000 PF Server Adapter - Intel(R) PRO/1000 PF Network Connection - Intel(R) PRO/1000 PF Dual Port Server Adapter - Intel(R) PRO/1000 PB Server Connection - Intel(R) PRO/1000 PL Network Connection - Intel(R) PRO/1000 EB Network Connection with I/O Acceleration - Intel(R) PRO/1000 EB Backplane Connection with I/O Acceleration - Intel(R) PRO/1000 PT Quad Port Server Adapter - - Adapters based on the Intel(R) 82542 and 82573V/E controller do not support Jumbo Frames. These correspond to the following product names: Intel(R) PRO/1000 Gigabit Server Adapter Intel(R) PRO/1000 PM Network Connection - - The following adapters do not support Jumbo Frames: - Intel(R) 82562V 10/100 Network Connection - Intel(R) 82566DM Gigabit Network Connection - Intel(R) 82566DC Gigabit Network Connection - Intel(R) 82566MM Gigabit Network Connection - Intel(R) 82566MC Gigabit Network Connection - Intel(R) 82562GT 10/100 Network Connection - Intel(R) 82562G 10/100 Network Connection - - Ethtool ------- The driver utilizes the ethtool interface for driver configuration and @@ -490,142 +437,14 @@ Additional Configurations The latest release of ethtool can be found from http://sourceforge.net/projects/gkernel. - NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support - for a more complete ethtool feature set can be enabled by upgrading - ethtool to ethtool-1.8.1. - Enabling Wake on LAN* (WoL) --------------------------- - WoL is configured through the Ethtool* utility. Ethtool is included with - all versions of Red Hat after Red Hat 7.2. For other Linux distributions, - download and install Ethtool from the following website: - http://sourceforge.net/projects/gkernel. - - For instructions on enabling WoL with Ethtool, refer to the website listed - above. + WoL is configured through the Ethtool* utility. WoL will be enabled on the system during the next shut down or reboot. For this driver version, in order to enable WoL, the e1000 driver must be loaded when shutting down or rebooting the system. - Wake On LAN is only supported on port A for the following devices: - Intel(R) PRO/1000 PT Dual Port Network Connection - Intel(R) PRO/1000 PT Dual Port Server Connection - Intel(R) PRO/1000 PT Dual Port Server Adapter - Intel(R) PRO/1000 PF Dual Port Server Adapter - Intel(R) PRO/1000 PT Quad Port Server Adapter - - NAPI - ---- - NAPI (Rx polling mode) is enabled in the e1000 driver. - - See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI. - - -Known Issues -============ - -Dropped Receive Packets on Half-duplex 10/100 Networks ------------------------------------------------------- -If you have an Intel PCI Express adapter running at 10mbps or 100mbps, half- -duplex, you may observe occasional dropped receive packets. There are no -workarounds for this problem in this network configuration. The network must -be updated to operate in full-duplex, and/or 1000mbps only. - -Jumbo Frames System Requirement -------------------------------- -Memory allocation failures have been observed on Linux systems with 64 MB -of RAM or less that are running Jumbo Frames. If you are using Jumbo -Frames, your system may require more than the advertised minimum -requirement of 64 MB of system memory. - -Performance Degradation with Jumbo Frames ------------------------------------------ -Degradation in throughput performance may be observed in some Jumbo frames -environments. If this is observed, increasing the application's socket -buffer size and/or increasing the /proc/sys/net/ipv4/tcp_*mem entry values -may help. See the specific application manual and -/usr/src/linux*/Documentation/ -networking/ip-sysctl.txt for more details. - -Jumbo Frames on Foundry BigIron 8000 switch -------------------------------------------- -There is a known issue using Jumbo frames when connected to a Foundry -BigIron 8000 switch. This is a 3rd party limitation. If you experience -loss of packets, lower the MTU size. - -Allocating Rx Buffers when Using Jumbo Frames ---------------------------------------------- -Allocating Rx buffers when using Jumbo Frames on 2.6.x kernels may fail if -the available memory is heavily fragmented. This issue may be seen with PCI-X -adapters or with packet split disabled. This can be reduced or eliminated -by changing the amount of available memory for receive buffer allocation, by -increasing /proc/sys/vm/min_free_kbytes. - -Multiple Interfaces on Same Ethernet Broadcast Network ------------------------------------------------------- -Due to the default ARP behavior on Linux, it is not possible to have -one system on two IP networks in the same Ethernet broadcast domain -(non-partitioned switch) behave as expected. All Ethernet interfaces -will respond to IP traffic for any IP address assigned to the system. -This results in unbalanced receive traffic. - -If you have multiple interfaces in a server, either turn on ARP -filtering by entering: - - echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter -(this only works if your kernel's version is higher than 2.4.5), - -NOTE: This setting is not saved across reboots. The configuration -change can be made permanent by adding the line: - net.ipv4.conf.all.arp_filter = 1 -to the file /etc/sysctl.conf - - or, - -install the interfaces in separate broadcast domains (either in -different switches or in a switch partitioned to VLANs). - -82541/82547 can't link or are slow to link with some link partners ------------------------------------------------------------------ -There is a known compatibility issue with 82541/82547 and some -low-end switches where the link will not be established, or will -be slow to establish. In particular, these switches are known to -be incompatible with 82541/82547: - - Planex FXG-08TE - I-O Data ETG-SH8 - -To workaround this issue, the driver can be compiled with an override -of the PHY's master/slave setting. Forcing master or forcing slave -mode will improve time-to-link. - - # make CFLAGS_EXTRA=-DE1000_MASTER_SLAVE=<n> - -Where <n> is: - - 0 = Hardware default - 1 = Master mode - 2 = Slave mode - 3 = Auto master/slave - -Disable rx flow control with ethtool ------------------------------------- -In order to disable receive flow control using ethtool, you must turn -off auto-negotiation on the same command line. - -For example: - - ethtool -A eth? autoneg off rx off - -Unplugging network cable while ethtool -p is running ----------------------------------------------------- -In kernel versions 2.5.50 and later (including 2.6 kernel), unplugging -the network cable while ethtool -p is running will cause the system to -become unresponsive to keyboard commands, except for control-alt-delete. -Restarting the system appears to be the only remedy. - - Support ======= diff --git a/Documentation/networking/e1000e.txt b/Documentation/networking/e1000e.txt new file mode 100644 index 000000000000..6aa048badf32 --- /dev/null +++ b/Documentation/networking/e1000e.txt @@ -0,0 +1,302 @@ +Linux* Driver for Intel(R) Network Connection +=============================================================== + +Intel Gigabit Linux driver. +Copyright(c) 1999 - 2010 Intel Corporation. + +Contents +======== + +- Identifying Your Adapter +- Command Line Parameters +- Additional Configurations +- Support + +Identifying Your Adapter +======================== + +The e1000e driver supports all PCI Express Intel(R) Gigabit Network +Connections, except those that are 82575, 82576 and 82580-based*. + +* NOTE: The Intel(R) PRO/1000 P Dual Port Server Adapter is supported by + the e1000 driver, not the e1000e driver due to the 82546 part being used + behind a PCI Express bridge. + +For more information on how to identify your adapter, go to the Adapter & +Driver ID Guide at: + + http://support.intel.com/support/go/network/adapter/idguide.htm + +For the latest Intel network drivers for Linux, refer to the following +website. In the search field, enter your adapter name or type, or use the +networking link on the left to search for your adapter: + + http://support.intel.com/support/go/network/adapter/home.htm + +Command Line Parameters +======================= + +The default value for each parameter is generally the recommended setting, +unless otherwise noted. + +NOTES: For more information about the InterruptThrottleRate, + RxIntDelay, TxIntDelay, RxAbsIntDelay, and TxAbsIntDelay + parameters, see the application note at: + http://www.intel.com/design/network/applnots/ap450.htm + +InterruptThrottleRate +--------------------- +Valid Range: 0,1,3,4,100-100000 (0=off, 1=dynamic, 3=dynamic conservative, + 4=simplified balancing) +Default Value: 3 + +The driver can limit the amount of interrupts per second that the adapter +will generate for incoming packets. It does this by writing a value to the +adapter that is based on the maximum amount of interrupts that the adapter +will generate per second. + +Setting InterruptThrottleRate to a value greater or equal to 100 +will program the adapter to send out a maximum of that many interrupts +per second, even if more packets have come in. This reduces interrupt +load on the system and can lower CPU utilization under heavy load, +but will increase latency as packets are not processed as quickly. + +The driver has two adaptive modes (setting 1 or 3) in which +it dynamically adjusts the InterruptThrottleRate value based on the traffic +that it receives. After determining the type of incoming traffic in the last +timeframe, it will adjust the InterruptThrottleRate to an appropriate value +for that traffic. + +The algorithm classifies the incoming traffic every interval into +classes. Once the class is determined, the InterruptThrottleRate value is +adjusted to suit that traffic type the best. There are three classes defined: +"Bulk traffic", for large amounts of packets of normal size; "Low latency", +for small amounts of traffic and/or a significant percentage of small +packets; and "Lowest latency", for almost completely small packets or +minimal traffic. + +In dynamic conservative mode, the InterruptThrottleRate value is set to 4000 +for traffic that falls in class "Bulk traffic". If traffic falls in the "Low +latency" or "Lowest latency" class, the InterruptThrottleRate is increased +stepwise to 20000. This default mode is suitable for most applications. + +For situations where low latency is vital such as cluster or +grid computing, the algorithm can reduce latency even more when +InterruptThrottleRate is set to mode 1. In this mode, which operates +the same as mode 3, the InterruptThrottleRate will be increased stepwise to +70000 for traffic in class "Lowest latency". + +In simplified mode the interrupt rate is based on the ratio of Tx and +Rx traffic. If the bytes per second rate is approximately equal the +interrupt rate will drop as low as 2000 interrupts per second. If the +traffic is mostly transmit or mostly receive, the interrupt rate could +be as high as 8000. + +Setting InterruptThrottleRate to 0 turns off any interrupt moderation +and may improve small packet latency, but is generally not suitable +for bulk throughput traffic. + +NOTE: InterruptThrottleRate takes precedence over the TxAbsIntDelay and + RxAbsIntDelay parameters. In other words, minimizing the receive + and/or transmit absolute delays does not force the controller to + generate more interrupts than what the Interrupt Throttle Rate + allows. + +NOTE: When e1000e is loaded with default settings and multiple adapters + are in use simultaneously, the CPU utilization may increase non- + linearly. In order to limit the CPU utilization without impacting + the overall throughput, we recommend that you load the driver as + follows: + + modprobe e1000e InterruptThrottleRate=3000,3000,3000 + + This sets the InterruptThrottleRate to 3000 interrupts/sec for + the first, second, and third instances of the driver. The range + of 2000 to 3000 interrupts per second works on a majority of + systems and is a good starting point, but the optimal value will + be platform-specific. If CPU utilization is not a concern, use + RX_POLLING (NAPI) and default driver settings. + +RxIntDelay +---------- +Valid Range: 0-65535 (0=off) +Default Value: 0 + +This value delays the generation of receive interrupts in units of 1.024 +microseconds. Receive interrupt reduction can improve CPU efficiency if +properly tuned for specific network traffic. Increasing this value adds +extra latency to frame reception and can end up decreasing the throughput +of TCP traffic. If the system is reporting dropped receives, this value +may be set too high, causing the driver to run out of available receive +descriptors. + +CAUTION: When setting RxIntDelay to a value other than 0, adapters may + hang (stop transmitting) under certain network conditions. If + this occurs a NETDEV WATCHDOG message is logged in the system + event log. In addition, the controller is automatically reset, + restoring the network connection. To eliminate the potential + for the hang ensure that RxIntDelay is set to 0. + +RxAbsIntDelay +------------- +Valid Range: 0-65535 (0=off) +Default Value: 8 + +This value, in units of 1.024 microseconds, limits the delay in which a +receive interrupt is generated. Useful only if RxIntDelay is non-zero, +this value ensures that an interrupt is generated after the initial +packet is received within the set amount of time. Proper tuning, +along with RxIntDelay, may improve traffic throughput in specific network +conditions. + +TxIntDelay +---------- +Valid Range: 0-65535 (0=off) +Default Value: 8 + +This value delays the generation of transmit interrupts in units of +1.024 microseconds. Transmit interrupt reduction can improve CPU +efficiency if properly tuned for specific network traffic. If the +system is reporting dropped transmits, this value may be set too high +causing the driver to run out of available transmit descriptors. + +TxAbsIntDelay +------------- +Valid Range: 0-65535 (0=off) +Default Value: 32 + +This value, in units of 1.024 microseconds, limits the delay in which a +transmit interrupt is generated. Useful only if TxIntDelay is non-zero, +this value ensures that an interrupt is generated after the initial +packet is sent on the wire within the set amount of time. Proper tuning, +along with TxIntDelay, may improve traffic throughput in specific +network conditions. + +Copybreak +--------- +Valid Range: 0-xxxxxxx (0=off) +Default Value: 256 + +Driver copies all packets below or equaling this size to a fresh Rx +buffer before handing it up the stack. + +This parameter is different than other parameters, in that it is a +single (not 1,1,1 etc.) parameter applied to all driver instances and +it is also available during runtime at +/sys/module/e1000e/parameters/copybreak + +SmartPowerDownEnable +-------------------- +Valid Range: 0-1 +Default Value: 0 (disabled) + +Allows PHY to turn off in lower power states. The user can set this parameter +in supported chipsets. + +KumeranLockLoss +--------------- +Valid Range: 0-1 +Default Value: 1 (enabled) + +This workaround skips resetting the PHY at shutdown for the initial +silicon releases of ICH8 systems. + +IntMode +------- +Valid Range: 0-2 (0=legacy, 1=MSI, 2=MSI-X) +Default Value: 2 + +Allows changing the interrupt mode at module load time, without requiring a +recompile. If the driver load fails to enable a specific interrupt mode, the +driver will try other interrupt modes, from least to most compatible. The +interrupt order is MSI-X, MSI, Legacy. If specifying MSI (IntMode=1) +interrupts, only MSI and Legacy will be attempted. + +CrcStripping +------------ +Valid Range: 0-1 +Default Value: 1 (enabled) + +Strip the CRC from received packets before sending up the network stack. If +you have a machine with a BMC enabled but cannot receive IPMI traffic after +loading or enabling the driver, try disabling this feature. + +WriteProtectNVM +--------------- +Valid Range: 0-1 +Default Value: 1 (enabled) + +Set the hardware to ignore all write/erase cycles to the GbE region in the +ICHx NVM (non-volatile memory). This feature can be disabled by the +WriteProtectNVM module parameter (enabled by default) only after a hardware +reset, but the machine must be power cycled before trying to enable writes. + +Note: the kernel boot option iomem=relaxed may need to be set if the kernel +config option CONFIG_STRICT_DEVMEM=y, if the root user wants to write the +NVM from user space via ethtool. + +Additional Configurations +========================= + + Jumbo Frames + ------------ + Jumbo Frames support is enabled by changing the MTU to a value larger than + the default of 1500. Use the ifconfig command to increase the MTU size. + For example: + + ifconfig eth<x> mtu 9000 up + + This setting is not saved across reboots. + + Notes: + + - The maximum MTU setting for Jumbo Frames is 9216. This value coincides + with the maximum Jumbo Frames size of 9234 bytes. + + - Using Jumbo Frames at 10 or 100 Mbps is not supported and may result in + poor performance or loss of link. + + - Some adapters limit Jumbo Frames sized packets to a maximum of + 4096 bytes and some adapters do not support Jumbo Frames. + + + Ethtool + ------- + The driver utilizes the ethtool interface for driver configuration and + diagnostics, as well as displaying statistical information. We + strongly recommend downloading the latest version of Ethtool at: + + http://sourceforge.net/projects/gkernel. + + Speed and Duplex + ---------------- + Speed and Duplex are configured through the Ethtool* utility. For + instructions, refer to the Ethtool man page. + + Enabling Wake on LAN* (WoL) + --------------------------- + WoL is configured through the Ethtool* utility. For instructions on + enabling WoL with Ethtool, refer to the Ethtool man page. + + WoL will be enabled on the system during the next shut down or reboot. + For this driver version, in order to enable WoL, the e1000e driver must be + loaded when shutting down or rebooting the system. + + In most cases Wake On LAN is only supported on port A for multiple port + adapters. To verify if a port supports Wake on LAN run ethtool eth<X>. + + +Support +======= + +For general information, go to the Intel support website at: + + www.intel.com/support/ + +or the Intel Wired Networking project hosted by Sourceforge at: + + http://sourceforge.net/projects/e1000 + +If an issue is identified with the released source code on the supported +kernel with a supported adapter, email the specific information related +to the issue to e1000-devel@lists.sf.net diff --git a/Documentation/networking/ixgbevf.txt b/Documentation/networking/ixgbevf.txt index 19015de6725f..21dd5d15b6b4 100755..100644 --- a/Documentation/networking/ixgbevf.txt +++ b/Documentation/networking/ixgbevf.txt @@ -1,19 +1,16 @@ Linux* Base Driver for Intel(R) Network Connection ================================================== -November 24, 2009 +Intel Gigabit Linux driver. +Copyright(c) 1999 - 2010 Intel Corporation. Contents ======== -- In This Release - Identifying Your Adapter - Known Issues/Troubleshooting - Support -In This Release -=============== - This file describes the ixgbevf Linux* Base Driver for Intel Network Connection. @@ -33,7 +30,7 @@ Identifying Your Adapter For more information on how to identify your adapter, go to the Adapter & Driver ID Guide at: - http://support.intel.com/support/network/sb/CS-008441.htm + http://support.intel.com/support/go/network/adapter/idguide.htm Known Issues/Troubleshooting ============================ @@ -57,34 +54,3 @@ or the Intel Wired Networking project hosted by Sourceforge at: If an issue is identified with the released source code on the supported kernel with a supported adapter, email the specific information related to the issue to e1000-devel@lists.sf.net - -License -======= - -Intel 10 Gigabit Linux driver. -Copyright(c) 1999 - 2009 Intel Corporation. - -This program is free software; you can redistribute it and/or modify it -under the terms and conditions of the GNU General Public License, -version 2, as published by the Free Software Foundation. - -This program is distributed in the hope it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - -The full GNU General Public License is included in this distribution in -the file called "COPYING". - -Trademarks -========== - -Intel, Itanium, and Pentium are trademarks or registered trademarks of -Intel Corporation or its subsidiaries in the United States and other -countries. - -* Other names and brands may be claimed as the property of others. diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index ccd951fa94ee..cc96ee2666f2 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c @@ -478,7 +478,7 @@ static void prepare_hwpoison_fd(void) } if (opt_unpoison && !hwpoison_forget_fd) { - sprintf(buf, "%s/renew-pfn", hwpoison_debug_fs); + sprintf(buf, "%s/unpoison-pfn", hwpoison_debug_fs); hwpoison_forget_fd = checked_open(buf, O_WRONLY); } } diff --git a/MAINTAINERS b/MAINTAINERS index f46d8e66333f..3d4179fbc526 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -969,6 +969,16 @@ L: linux-samsung-soc@vger.kernel.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-s5p*/ +ARM/SAMSUNG S5P SERIES FIMC SUPPORT +M: Kyungmin Park <kyungmin.park@samsung.com> +M: Sylwester Nawrocki <s.nawrocki@samsung.com> +L: linux-arm-kernel@lists.infradead.org +L: linux-media@vger.kernel.org +S: Maintained +F: arch/arm/plat-s5p/dev-fimc* +F: arch/arm/plat-samsung/include/plat/*fimc* +F: drivers/media/video/s5p-fimc/ + ARM/SHMOBILE ARM ARCHITECTURE M: Paul Mundt <lethal@linux-sh.org> M: Magnus Damm <magnus.damm@gmail.com> @@ -1517,6 +1527,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git S: Supported F: Documentation/filesystems/ceph.txt F: fs/ceph +F: net/ceph +F: include/linux/ceph CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM: M: David Vrabel <david.vrabel@csr.com> @@ -2535,7 +2547,7 @@ S: Supported F: drivers/scsi/gdt* GENERIC GPIO I2C DRIVER -M: Haavard Skinnemoen <hskinnemoen@atmel.com> +M: Haavard Skinnemoen <hskinnemoen@gmail.com> S: Supported F: drivers/i2c/busses/i2c-gpio.c F: include/linux/i2c-gpio.h @@ -3063,16 +3075,27 @@ L: netdev@vger.kernel.org S: Maintained F: drivers/net/ixp2000/ -INTEL ETHERNET DRIVERS (e100/e1000/e1000e/igb/igbvf/ixgb/ixgbe) +INTEL ETHERNET DRIVERS (e100/e1000/e1000e/igb/igbvf/ixgb/ixgbe/ixgbevf) M: Jeff Kirsher <jeffrey.t.kirsher@intel.com> M: Jesse Brandeburg <jesse.brandeburg@intel.com> M: Bruce Allan <bruce.w.allan@intel.com> -M: Alex Duyck <alexander.h.duyck@intel.com> +M: Carolyn Wyborny <carolyn.wyborny@intel.com> +M: Don Skidmore <donald.c.skidmore@intel.com> +M: Greg Rose <gregory.v.rose@intel.com> M: PJ Waskiewicz <peter.p.waskiewicz.jr@intel.com> +M: Alex Duyck <alexander.h.duyck@intel.com> M: John Ronciak <john.ronciak@intel.com> L: e1000-devel@lists.sourceforge.net W: http://e1000.sourceforge.net/ S: Supported +F: Documentation/networking/e100.txt +F: Documentation/networking/e1000.txt +F: Documentation/networking/e1000e.txt +F: Documentation/networking/igb.txt +F: Documentation/networking/igbvf.txt +F: Documentation/networking/ixgb.txt +F: Documentation/networking/ixgbe.txt +F: Documentation/networking/ixgbevf.txt F: drivers/net/e100.c F: drivers/net/e1000/ F: drivers/net/e1000e/ @@ -3080,6 +3103,7 @@ F: drivers/net/igb/ F: drivers/net/igbvf/ F: drivers/net/ixgb/ F: drivers/net/ixgbe/ +F: drivers/net/ixgbevf/ INTEL PRO/WIRELESS 2100 NETWORK CONNECTION SUPPORT L: linux-wireless@vger.kernel.org @@ -3140,7 +3164,7 @@ F: drivers/net/ioc3-eth.c IOC3 SERIAL DRIVER M: Pat Gefre <pfg@sgi.com> -L: linux-mips@linux-mips.org +L: linux-serial@vger.kernel.org S: Maintained F: drivers/serial/ioc3_serial.c @@ -4783,6 +4807,15 @@ F: fs/qnx4/ F: include/linux/qnx4_fs.h F: include/linux/qnxtypes.h +RADOS BLOCK DEVICE (RBD) +F: include/linux/qnxtypes.h +M: Yehuda Sadeh <yehuda@hq.newdream.net> +M: Sage Weil <sage@newdream.net> +M: ceph-devel@vger.kernel.org +S: Supported +F: drivers/block/rbd.c +F: drivers/block/rbd_types.h + RADEON FRAMEBUFFER DISPLAY DRIVER M: Benjamin Herrenschmidt <benh@kernel.crashing.org> L: linux-fbdev@vger.kernel.org @@ -5008,6 +5041,12 @@ F: drivers/media/common/saa7146* F: drivers/media/video/*7146* F: include/media/*7146* +SAMSUNG AUDIO (ASoC) DRIVERS +M: Jassi Brar <jassi.brar@samsung.com> +L: alsa-devel@alsa-project.org (moderated for non-subscribers) +S: Supported +F: sound/soc/s3c24xx + TLG2300 VIDEO4LINUX-2 DRIVER M: Huang Shijie <shijie8@gmail.com> M: Kang Yong <kangyong@telegent.com> @@ -6450,8 +6489,10 @@ F: include/linux/wm97xx.h WOLFSON MICROELECTRONICS DRIVERS M: Mark Brown <broonie@opensource.wolfsonmicro.com> M: Ian Lartey <ian@opensource.wolfsonmicro.com> +M: Dimitris Papastamos <dp@opensource.wolfsonmicro.com> +T: git git://opensource.wolfsonmicro.com/linux-2.6-asoc T: git git://opensource.wolfsonmicro.com/linux-2.6-audioplus -W: http://opensource.wolfsonmicro.com/node/8 +W: http://opensource.wolfsonmicro.com/content/linux-drivers-wolfson-devices S: Supported F: Documentation/hwmon/wm83?? F: drivers/leds/leds-wm83*.c @@ -1,8 +1,8 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 36 -EXTRAVERSION = -rc7 -NAME = Sheep on Meth +EXTRAVERSION = +NAME = Flesh-Eating Bats with Fangs # *DOCUMENTATION* # To see a list of typical targets execute "make help" diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 7c0dfccd05bd..9103904b3dab 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1102,6 +1102,20 @@ config ARM_ERRATA_720789 invalidated are not, resulting in an incoherency in the system page tables. The workaround changes the TLB flushing routines to invalidate entries regardless of the ASID. + +config ARM_ERRATA_743622 + bool "ARM errata: Faulty hazard checking in the Store Buffer may lead to data corruption" + depends on CPU_V7 + help + This option enables the workaround for the 743622 Cortex-A9 + (r2p0..r2p2) erratum. Under very rare conditions, a faulty + optimisation in the Cortex-A9 Store Buffer may lead to data + corruption. This workaround sets a specific bit in the diagnostic + register of the Cortex-A9 which disables the Store Buffer + optimisation, preventing the defect from occurring. This has no + visible impact on the overall performance or power consumption of the + processor. + endmenu source "arch/arm/common/Kconfig" diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index 8bccbfa693ff..2c1f0050c9c4 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -1162,11 +1162,12 @@ space_cccc_001x(kprobe_opcode_t insn, struct arch_specific_insn *asi) { /* * MSR : cccc 0011 0x10 xxxx xxxx xxxx xxxx xxxx - * Undef : cccc 0011 0x00 xxxx xxxx xxxx xxxx xxxx + * Undef : cccc 0011 0100 xxxx xxxx xxxx xxxx xxxx * ALU op with S bit and Rd == 15 : * cccc 001x xxx1 xxxx 1111 xxxx xxxx xxxx */ - if ((insn & 0x0f900000) == 0x03200000 || /* MSR & Undef */ + if ((insn & 0x0fb00000) == 0x03200000 || /* MSR */ + (insn & 0x0ff00000) == 0x03400000 || /* Undef */ (insn & 0x0e10f000) == 0x0210f000) /* ALU s-bit, R15 */ return INSN_REJECTED; @@ -1177,7 +1178,7 @@ space_cccc_001x(kprobe_opcode_t insn, struct arch_specific_insn *asi) * *S (bit 20) updates condition codes * ADC/SBC/RSC reads the C flag */ - insn &= 0xfff00fff; /* Rn = r0, Rd = r0 */ + insn &= 0xffff0fff; /* Rd = r0 */ asi->insn[0] = insn; asi->insn_handler = (insn & (1 << 20)) ? /* S-bit */ emulate_alu_imm_rwflags : emulate_alu_imm_rflags; diff --git a/arch/arm/mach-at91/include/mach/system.h b/arch/arm/mach-at91/include/mach/system.h index c80e090b3670..ee8db152592e 100644 --- a/arch/arm/mach-at91/include/mach/system.h +++ b/arch/arm/mach-at91/include/mach/system.h @@ -28,17 +28,16 @@ static inline void arch_idle(void) { -#ifndef CONFIG_DEBUG_KERNEL /* * Disable the processor clock. The processor will be automatically * re-enabled by an interrupt or by a reset. */ at91_sys_write(AT91_PMC_SCDR, AT91_PMC_PCK); -#else +#ifndef CONFIG_CPU_ARM920T /* * Set the processor (CP15) into 'Wait for Interrupt' mode. - * Unlike disabling the processor clock via the PMC (above) - * this allows the processor to be woken via JTAG. + * Post-RM9200 processors need this in conjunction with the above + * to save power when idle. */ cpu_do_idle(); #endif diff --git a/arch/arm/mach-bcmring/dma.c b/arch/arm/mach-bcmring/dma.c index 29c0a911df26..77eb35c89cd0 100644 --- a/arch/arm/mach-bcmring/dma.c +++ b/arch/arm/mach-bcmring/dma.c @@ -691,7 +691,7 @@ int dma_init(void) memset(&gDMA, 0, sizeof(gDMA)); - init_MUTEX_LOCKED(&gDMA.lock); + sema_init(&gDMA.lock, 0); init_waitqueue_head(&gDMA.freeChannelQ); /* Initialize the Hardware */ @@ -1574,7 +1574,7 @@ int dma_init_mem_map(DMA_MemMap_t *memMap) { memset(memMap, 0, sizeof(*memMap)); - init_MUTEX(&memMap->lock); + sema_init(&memMap->lock, 1); return 0; } diff --git a/arch/arm/mach-ep93xx/dma-m2p.c b/arch/arm/mach-ep93xx/dma-m2p.c index 8904ca4e2e24..a696d354b1f8 100644 --- a/arch/arm/mach-ep93xx/dma-m2p.c +++ b/arch/arm/mach-ep93xx/dma-m2p.c @@ -276,7 +276,7 @@ static void channel_disable(struct m2p_channel *ch) v &= ~(M2P_CONTROL_STALL_IRQ_EN | M2P_CONTROL_NFB_IRQ_EN); m2p_set_control(ch, v); - while (m2p_channel_state(ch) == STATE_ON) + while (m2p_channel_state(ch) >= STATE_ON) cpu_relax(); m2p_set_control(ch, 0x0); diff --git a/arch/arm/mach-imx/Kconfig b/arch/arm/mach-imx/Kconfig index c5c0369bb481..2f7e2728970d 100644 --- a/arch/arm/mach-imx/Kconfig +++ b/arch/arm/mach-imx/Kconfig @@ -122,6 +122,7 @@ config MACH_CPUIMX27 select IMX_HAVE_PLATFORM_IMX_I2C select IMX_HAVE_PLATFORM_IMX_UART select IMX_HAVE_PLATFORM_MXC_NAND + select MXC_ULPI if USB_ULPI help Include support for Eukrea CPUIMX27 platform. This includes specific configurations for the module and its peripherals. diff --git a/arch/arm/mach-imx/mach-cpuimx27.c b/arch/arm/mach-imx/mach-cpuimx27.c index 339150ab0ea5..6830afd1d2ba 100644 --- a/arch/arm/mach-imx/mach-cpuimx27.c +++ b/arch/arm/mach-imx/mach-cpuimx27.c @@ -259,7 +259,7 @@ static void __init eukrea_cpuimx27_init(void) i2c_register_board_info(0, eukrea_cpuimx27_i2c_devices, ARRAY_SIZE(eukrea_cpuimx27_i2c_devices)); - imx27_add_i2c_imx1(&cpuimx27_i2c1_data); + imx27_add_i2c_imx0(&cpuimx27_i2c1_data); platform_add_devices(platform_devices, ARRAY_SIZE(platform_devices)); diff --git a/arch/arm/mach-s5p6440/cpu.c b/arch/arm/mach-s5p6440/cpu.c index 526f33adb31d..ec592e866054 100644 --- a/arch/arm/mach-s5p6440/cpu.c +++ b/arch/arm/mach-s5p6440/cpu.c @@ -19,6 +19,7 @@ #include <linux/sysdev.h> #include <linux/serial_core.h> #include <linux/platform_device.h> +#include <linux/sched.h> #include <asm/mach/arch.h> #include <asm/mach/map.h> diff --git a/arch/arm/mach-s5p6442/cpu.c b/arch/arm/mach-s5p6442/cpu.c index a48fb553fd01..70ac681af72b 100644 --- a/arch/arm/mach-s5p6442/cpu.c +++ b/arch/arm/mach-s5p6442/cpu.c @@ -19,6 +19,7 @@ #include <linux/sysdev.h> #include <linux/serial_core.h> #include <linux/platform_device.h> +#include <linux/sched.h> #include <asm/mach/arch.h> #include <asm/mach/map.h> diff --git a/arch/arm/mach-s5pc100/cpu.c b/arch/arm/mach-s5pc100/cpu.c index 251c92ac5b22..cd1afbce83e2 100644 --- a/arch/arm/mach-s5pc100/cpu.c +++ b/arch/arm/mach-s5pc100/cpu.c @@ -21,6 +21,7 @@ #include <linux/sysdev.h> #include <linux/serial_core.h> #include <linux/platform_device.h> +#include <linux/sched.h> #include <asm/mach/arch.h> #include <asm/mach/map.h> diff --git a/arch/arm/mach-s5pv210/clock.c b/arch/arm/mach-s5pv210/clock.c index cfecd70657cb..d562670e1b0b 100644 --- a/arch/arm/mach-s5pv210/clock.c +++ b/arch/arm/mach-s5pv210/clock.c @@ -173,11 +173,6 @@ static int s5pv210_clk_ip3_ctrl(struct clk *clk, int enable) return s5p_gatectrl(S5P_CLKGATE_IP3, clk, enable); } -static int s5pv210_clk_ip4_ctrl(struct clk *clk, int enable) -{ - return s5p_gatectrl(S5P_CLKGATE_IP4, clk, enable); -} - static int s5pv210_clk_mask0_ctrl(struct clk *clk, int enable) { return s5p_gatectrl(S5P_CLK_SRC_MASK0, clk, enable); diff --git a/arch/arm/mach-s5pv210/cpu.c b/arch/arm/mach-s5pv210/cpu.c index 77f456c91ad3..245b82b53df4 100644 --- a/arch/arm/mach-s5pv210/cpu.c +++ b/arch/arm/mach-s5pv210/cpu.c @@ -19,6 +19,7 @@ #include <linux/io.h> #include <linux/sysdev.h> #include <linux/platform_device.h> +#include <linux/sched.h> #include <asm/mach/arch.h> #include <asm/mach/map.h> diff --git a/arch/arm/mach-vexpress/ct-ca9x4.c b/arch/arm/mach-vexpress/ct-ca9x4.c index efb127022d42..71fb17349520 100644 --- a/arch/arm/mach-vexpress/ct-ca9x4.c +++ b/arch/arm/mach-vexpress/ct-ca9x4.c @@ -68,7 +68,7 @@ static void __init ct_ca9x4_init_irq(void) } #if 0 -static void ct_ca9x4_timer_init(void) +static void __init ct_ca9x4_timer_init(void) { writel(0, MMIO_P2V(CT_CA9X4_TIMER0) + TIMER_CTRL); writel(0, MMIO_P2V(CT_CA9X4_TIMER1) + TIMER_CTRL); @@ -222,7 +222,7 @@ static struct platform_device pmu_device = { .resource = pmu_resources, }; -static void ct_ca9x4_init(void) +static void __init ct_ca9x4_init(void) { int i; diff --git a/arch/arm/mach-vexpress/v2m.c b/arch/arm/mach-vexpress/v2m.c index 817f0ad38a0b..7eaa232180a5 100644 --- a/arch/arm/mach-vexpress/v2m.c +++ b/arch/arm/mach-vexpress/v2m.c @@ -48,7 +48,7 @@ void __init v2m_map_io(struct map_desc *tile, size_t num) } -static void v2m_timer_init(void) +static void __init v2m_timer_init(void) { writel(0, MMIO_P2V(V2M_TIMER0) + TIMER_CTRL); writel(0, MMIO_P2V(V2M_TIMER1) + TIMER_CTRL); diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index ab506272b2d3..17e7b0b57e49 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -204,8 +204,12 @@ void __iomem * __arm_ioremap_pfn_caller(unsigned long pfn, /* * Don't allow RAM to be mapped - this causes problems with ARMv6+ */ - if (WARN_ON(pfn_valid(pfn))) - return NULL; + if (pfn_valid(pfn)) { + printk(KERN_WARNING "BUG: Your driver calls ioremap() on system memory. This leads\n" + KERN_WARNING "to architecturally unpredictable behaviour on ARMv6+, and ioremap()\n" + KERN_WARNING "will fail in the next kernel release. Please fix your driver.\n"); + WARN_ON(1); + } type = get_mem_type(mtype); if (!type) diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 6a3a2d0cd6db..e8ed9dc461fe 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -248,7 +248,7 @@ static struct mem_type mem_types[] = { }, [MT_MEMORY] = { .prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY | - L_PTE_USER | L_PTE_EXEC, + L_PTE_WRITE | L_PTE_EXEC, .prot_l1 = PMD_TYPE_TABLE, .prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE, .domain = DOMAIN_KERNEL, @@ -259,7 +259,7 @@ static struct mem_type mem_types[] = { }, [MT_MEMORY_NONCACHED] = { .prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY | - L_PTE_USER | L_PTE_EXEC | L_PTE_MT_BUFFERABLE, + L_PTE_WRITE | L_PTE_EXEC | L_PTE_MT_BUFFERABLE, .prot_l1 = PMD_TYPE_TABLE, .prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE, .domain = DOMAIN_KERNEL, diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S index 7563ff0141bd..197f21bed5e9 100644 --- a/arch/arm/mm/proc-v7.S +++ b/arch/arm/mm/proc-v7.S @@ -253,6 +253,14 @@ __v7_setup: orreq r10, r10, #1 << 22 @ set bit #22 mcreq p15, 0, r10, c15, c0, 1 @ write diagnostic register #endif +#ifdef CONFIG_ARM_ERRATA_743622 + teq r6, #0x20 @ present in r2p0 + teqne r6, #0x21 @ present in r2p1 + teqne r6, #0x22 @ present in r2p2 + mrceq p15, 0, r10, c15, c0, 1 @ read diagnostic register + orreq r10, r10, #1 << 6 @ set bit #6 + mcreq p15, 0, r10, c15, c0, 1 @ write diagnostic register +#endif 3: mov r10, #0 #ifdef HARVARD_CACHE @@ -365,7 +373,7 @@ __v7_ca9mp_proc_info: b __v7_ca9mp_setup .long cpu_arch_name .long cpu_elf_name - .long HWCAP_SWP|HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT|HWCAP_EDSP + .long HWCAP_SWP|HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT|HWCAP_EDSP|HWCAP_TLS .long cpu_v7_name .long v7_processor_functions .long v7wbi_tlb_fns diff --git a/arch/arm/plat-omap/iommu.c b/arch/arm/plat-omap/iommu.c index a202a2ce6e3d..6cd151b31bc5 100644 --- a/arch/arm/plat-omap/iommu.c +++ b/arch/arm/plat-omap/iommu.c @@ -320,6 +320,7 @@ void flush_iotlb_page(struct iommu *obj, u32 da) if ((start <= da) && (da < start + bytes)) { dev_dbg(obj->dev, "%s: %08x<=%08x(%x)\n", __func__, start, da, bytes); + iotlb_load_cr(obj, &cr); iommu_write_reg(obj, 1, MMU_FLUSH_ENTRY); } } diff --git a/arch/arm/plat-samsung/adc.c b/arch/arm/plat-samsung/adc.c index 04d9521ddc9f..e8f2be2d67f2 100644 --- a/arch/arm/plat-samsung/adc.c +++ b/arch/arm/plat-samsung/adc.c @@ -435,7 +435,6 @@ static int s3c_adc_suspend(struct platform_device *pdev, pm_message_t state) static int s3c_adc_resume(struct platform_device *pdev) { struct adc_device *adc = platform_get_drvdata(pdev); - unsigned long flags; clk_enable(adc->clk); enable_irq(adc->irq); diff --git a/arch/arm/plat-samsung/clock.c b/arch/arm/plat-samsung/clock.c index 90a20512d68d..e8d20b0bc50e 100644 --- a/arch/arm/plat-samsung/clock.c +++ b/arch/arm/plat-samsung/clock.c @@ -48,6 +48,9 @@ #include <plat/clock.h> #include <plat/cpu.h> +#include <linux/serial_core.h> +#include <plat/regs-serial.h> /* for s3c24xx_uart_devs */ + /* clock information */ static LIST_HEAD(clocks); @@ -65,6 +68,28 @@ static int clk_null_enable(struct clk *clk, int enable) return 0; } +static int dev_is_s3c_uart(struct device *dev) +{ + struct platform_device **pdev = s3c24xx_uart_devs; + int i; + for (i = 0; i < ARRAY_SIZE(s3c24xx_uart_devs); i++, pdev++) + if (*pdev && dev == &(*pdev)->dev) + return 1; + return 0; +} + +/* + * Serial drivers call get_clock() very early, before platform bus + * has been set up, this requires a special check to let them get + * a proper clock + */ + +static int dev_is_platform_device(struct device *dev) +{ + return dev->bus == &platform_bus_type || + (dev->bus == NULL && dev_is_s3c_uart(dev)); +} + /* Clock API calls */ struct clk *clk_get(struct device *dev, const char *id) @@ -73,7 +98,7 @@ struct clk *clk_get(struct device *dev, const char *id) struct clk *clk = ERR_PTR(-ENOENT); int idno; - if (dev == NULL || dev->bus != &platform_bus_type) + if (dev == NULL || !dev_is_platform_device(dev)) idno = -1; else idno = to_platform_device(dev)->id; diff --git a/arch/m32r/include/asm/elf.h b/arch/m32r/include/asm/elf.h index 2f85412ef730..b8da7d0574d2 100644 --- a/arch/m32r/include/asm/elf.h +++ b/arch/m32r/include/asm/elf.h @@ -82,9 +82,9 @@ typedef elf_fpreg_t elf_fpregset_t; * These are used to set parameters in the core dumps. */ #define ELF_CLASS ELFCLASS32 -#if defined(__LITTLE_ENDIAN) +#if defined(__LITTLE_ENDIAN__) #define ELF_DATA ELFDATA2LSB -#elif defined(__BIG_ENDIAN) +#elif defined(__BIG_ENDIAN__) #define ELF_DATA ELFDATA2MSB #else #error no endian defined diff --git a/arch/m32r/kernel/.gitignore b/arch/m32r/kernel/.gitignore new file mode 100644 index 000000000000..c5f676c3c224 --- /dev/null +++ b/arch/m32r/kernel/.gitignore @@ -0,0 +1 @@ +vmlinux.lds diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index 7bbe38645ed5..a08697f0886d 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -28,6 +28,8 @@ #define DEBUG_SIG 0 +#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) + asmlinkage int sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, unsigned long r2, unsigned long r3, unsigned long r4, @@ -254,7 +256,7 @@ give_sigsegv: static int prev_insn(struct pt_regs *regs) { u16 inst; - if (get_user(&inst, (u16 __user *)(regs->bpc - 2))) + if (get_user(inst, (u16 __user *)(regs->bpc - 2))) return -EFAULT; if ((inst & 0xfff0) == 0x10f0) /* trap ? */ regs->bpc -= 2; diff --git a/arch/mips/Kbuild b/arch/mips/Kbuild index e322d65f33a4..7dd65cfae837 100644 --- a/arch/mips/Kbuild +++ b/arch/mips/Kbuild @@ -7,6 +7,10 @@ subdir-ccflags-y := -Werror include arch/mips/Kbuild.platforms obj-y := $(platform-y) +# make clean traverses $(obj-) without having included .config, so +# everything ends up here +obj- := $(platform-) + # mips object files # The object files are linked as core-y files would be linked diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 5526faabfc21..4c9f402295dd 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -881,11 +881,15 @@ config NO_IOPORT config GENERIC_ISA_DMA bool select ZONE_DMA if GENERIC_ISA_DMA_SUPPORT_BROKEN=n + select ISA_DMA_API config GENERIC_ISA_DMA_SUPPORT_BROKEN bool select GENERIC_ISA_DMA +config ISA_DMA_API + bool + config GENERIC_GPIO bool diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile index 5fd7f7a58b7e..5042d51b0512 100644 --- a/arch/mips/boot/compressed/Makefile +++ b/arch/mips/boot/compressed/Makefile @@ -105,4 +105,4 @@ OBJCOPYFLAGS_vmlinuz.srec := $(OBJCOPYFLAGS) -S -O srec vmlinuz.srec: vmlinuz $(call cmd,objcopy) -clean-files := $(objtree)/vmlinuz.* +clean-files := $(objtree)/vmlinuz $(objtree)/vmlinuz.{32,ecoff,bin,srec} diff --git a/arch/mips/dec/Platform b/arch/mips/dec/Platform index 3adbcbd95db1..cf55a6f4e720 100644 --- a/arch/mips/dec/Platform +++ b/arch/mips/dec/Platform @@ -1,7 +1,7 @@ # # DECstation family # -platform-$(CONFIG_MACH_DECSTATION) = dec/ +platform-$(CONFIG_MACH_DECSTATION) += dec/ cflags-$(CONFIG_MACH_DECSTATION) += \ -I$(srctree)/arch/mips/include/asm/mach-dec libs-$(CONFIG_MACH_DECSTATION) += arch/mips/dec/prom/ diff --git a/arch/mips/include/asm/fcntl.h b/arch/mips/include/asm/fcntl.h index e482fe90fe88..75eddedcfc3e 100644 --- a/arch/mips/include/asm/fcntl.h +++ b/arch/mips/include/asm/fcntl.h @@ -56,6 +56,7 @@ */ #ifdef CONFIG_32BIT +#include <linux/types.h> struct flock { short l_type; diff --git a/arch/mips/include/asm/siginfo.h b/arch/mips/include/asm/siginfo.h index 96e28f18dad1..1ca64b4d33d9 100644 --- a/arch/mips/include/asm/siginfo.h +++ b/arch/mips/include/asm/siginfo.h @@ -88,6 +88,7 @@ typedef struct siginfo { #ifdef __ARCH_SI_TRAPNO int _trapno; /* TRAP # which caused the signal */ #endif + short _addr_lsb; } _sigfault; /* SIGPOLL, SIGXFSZ (To do ...) */ diff --git a/arch/mips/jz4740/Platform b/arch/mips/jz4740/Platform index 6a97230e3d05..ba91be9c21ef 100644 --- a/arch/mips/jz4740/Platform +++ b/arch/mips/jz4740/Platform @@ -1,3 +1,3 @@ -core-$(CONFIG_MACH_JZ4740) += arch/mips/jz4740/ +platform-$(CONFIG_MACH_JZ4740) += jz4740/ cflags-$(CONFIG_MACH_JZ4740) += -I$(srctree)/arch/mips/include/asm/mach-jz4740 load-$(CONFIG_MACH_JZ4740) += 0xffffffff80010000 diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c index 0176ed015c89..32103cc2a257 100644 --- a/arch/mips/kernel/branch.c +++ b/arch/mips/kernel/branch.c @@ -40,7 +40,6 @@ int __compute_return_epc(struct pt_regs *regs) return -EFAULT; } - regs->regs[0] = 0; switch (insn.i_format.opcode) { /* * jr and jalr are in r_format format. diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index 2340f11dc29c..9a526ba6f257 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -103,7 +103,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len, if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; - retval = security_task_setscheduler(p, 0, NULL); + retval = security_task_setscheduler(p) if (retval) goto out_unlock; diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index c51b95ff8644..c8777333e198 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -536,7 +536,7 @@ asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit) { /* do the secure computing check first */ if (!entryexit) - secure_computing(regs->regs[0]); + secure_computing(regs->regs[2]); if (unlikely(current->audit_context) && entryexit) audit_syscall_exit(AUDITSC_RESULT(regs->regs[2]), @@ -565,7 +565,7 @@ asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit) out: if (unlikely(current->audit_context) && !entryexit) - audit_syscall_entry(audit_arch(), regs->regs[0], + audit_syscall_entry(audit_arch(), regs->regs[2], regs->regs[4], regs->regs[5], regs->regs[6], regs->regs[7]); } diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 584415eef8c9..fbaabad0e6e2 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -63,9 +63,9 @@ stack_done: sw t0, PT_R7(sp) # set error flag beqz t0, 1f + lw t1, PT_R2(sp) # syscall number negu v0 # error - sw v0, PT_R0(sp) # set flag for syscall - # restarting + sw t1, PT_R0(sp) # save it for syscall restarting 1: sw v0, PT_R2(sp) # result o32_syscall_exit: @@ -104,9 +104,9 @@ syscall_trace_entry: sw t0, PT_R7(sp) # set error flag beqz t0, 1f + lw t1, PT_R2(sp) # syscall number negu v0 # error - sw v0, PT_R0(sp) # set flag for syscall - # restarting + sw t1, PT_R0(sp) # save it for syscall restarting 1: sw v0, PT_R2(sp) # result j syscall_exit @@ -169,8 +169,7 @@ stackargs: * We probably should handle this case a bit more drastic. */ bad_stack: - negu v0 # error - sw v0, PT_R0(sp) + li v0, EFAULT sw v0, PT_R2(sp) li t0, 1 # set error flag sw t0, PT_R7(sp) diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S index 5573f8e4e326..3f4179283207 100644 --- a/arch/mips/kernel/scall64-64.S +++ b/arch/mips/kernel/scall64-64.S @@ -66,9 +66,9 @@ NESTED(handle_sys64, PT_SIZE, sp) sd t0, PT_R7(sp) # set error flag beqz t0, 1f + ld t1, PT_R2(sp) # syscall number dnegu v0 # error - sd v0, PT_R0(sp) # set flag for syscall - # restarting + sd t1, PT_R0(sp) # save it for syscall restarting 1: sd v0, PT_R2(sp) # result n64_syscall_exit: @@ -109,8 +109,9 @@ syscall_trace_entry: sd t0, PT_R7(sp) # set error flag beqz t0, 1f + ld t1, PT_R2(sp) # syscall number dnegu v0 # error - sd v0, PT_R0(sp) # set flag for syscall restarting + sd t1, PT_R0(sp) # save it for syscall restarting 1: sd v0, PT_R2(sp) # result j syscall_exit diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 1e38ec97672e..f08ece6d8acc 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -65,8 +65,9 @@ NESTED(handle_sysn32, PT_SIZE, sp) sd t0, PT_R7(sp) # set error flag beqz t0, 1f + ld t1, PT_R2(sp) # syscall number dnegu v0 # error - sd v0, PT_R0(sp) # set flag for syscall restarting + sd t1, PT_R0(sp) # save it for syscall restarting 1: sd v0, PT_R2(sp) # result local_irq_disable # make sure need_resched and @@ -106,8 +107,9 @@ n32_syscall_trace_entry: sd t0, PT_R7(sp) # set error flag beqz t0, 1f + ld t1, PT_R2(sp) # syscall number dnegu v0 # error - sd v0, PT_R0(sp) # set flag for syscall restarting + sd t1, PT_R0(sp) # save it for syscall restarting 1: sd v0, PT_R2(sp) # result j syscall_exit @@ -320,10 +322,10 @@ EXPORT(sysn32_call_table) PTR sys_cacheflush PTR sys_cachectl PTR sys_sysmips - PTR sys_io_setup /* 6200 */ + PTR compat_sys_io_setup /* 6200 */ PTR sys_io_destroy - PTR sys_io_getevents - PTR sys_io_submit + PTR compat_sys_io_getevents + PTR compat_sys_io_submit PTR sys_io_cancel PTR sys_exit_group /* 6205 */ PTR sys_lookup_dcookie diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index 171979fc98e5..78d768a3e19d 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -93,8 +93,9 @@ NESTED(handle_sys, PT_SIZE, sp) sd t0, PT_R7(sp) # set error flag beqz t0, 1f + ld t1, PT_R2(sp) # syscall number dnegu v0 # error - sd v0, PT_R0(sp) # flag for syscall restarting + sd t1, PT_R0(sp) # save it for syscall restarting 1: sd v0, PT_R2(sp) # result o32_syscall_exit: @@ -142,8 +143,9 @@ trace_a_syscall: sd t0, PT_R7(sp) # set error flag beqz t0, 1f + ld t1, PT_R2(sp) # syscall number dnegu v0 # error - sd v0, PT_R0(sp) # set flag for syscall restarting + sd t1, PT_R0(sp) # save it for syscall restarting 1: sd v0, PT_R2(sp) # result j syscall_exit @@ -154,8 +156,7 @@ trace_a_syscall: * The stackpointer for a call with more than 4 arguments is bad. */ bad_stack: - dnegu v0 # error - sd v0, PT_R0(sp) + li v0, EFAULT sd v0, PT_R2(sp) li t0, 1 # set error flag sd t0, PT_R7(sp) @@ -444,10 +445,10 @@ sys_call_table: PTR compat_sys_futex PTR compat_sys_sched_setaffinity PTR compat_sys_sched_getaffinity /* 4240 */ - PTR sys_io_setup + PTR compat_sys_io_setup PTR sys_io_destroy - PTR sys_io_getevents - PTR sys_io_submit + PTR compat_sys_io_getevents + PTR compat_sys_io_submit PTR sys_io_cancel /* 4245 */ PTR sys_exit_group PTR sys32_lookup_dcookie diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 2099d5a4c4b7..5922342bca39 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -390,7 +390,6 @@ asmlinkage void sys_rt_sigreturn(nabi_no_regargs struct pt_regs regs) { struct rt_sigframe __user *frame; sigset_t set; - stack_t st; int sig; frame = (struct rt_sigframe __user *) regs.regs[29]; @@ -411,11 +410,9 @@ asmlinkage void sys_rt_sigreturn(nabi_no_regargs struct pt_regs regs) else if (sig) force_sig(sig, current); - if (__copy_from_user(&st, &frame->rs_uc.uc_stack, sizeof(st))) - goto badframe; /* It is more difficult to avoid calling this function than to call it and ignore errors. */ - do_sigaltstack((stack_t __user *)&st, NULL, regs.regs[29]); + do_sigaltstack(&frame->rs_uc.uc_stack, NULL, regs.regs[29]); /* * Don't let your children do this ... @@ -550,23 +547,26 @@ static int handle_signal(unsigned long sig, siginfo_t *info, struct mips_abi *abi = current->thread.abi; void *vdso = current->mm->context.vdso; - switch(regs->regs[0]) { - case ERESTART_RESTARTBLOCK: - case ERESTARTNOHAND: - regs->regs[2] = EINTR; - break; - case ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { + if (regs->regs[0]) { + switch(regs->regs[2]) { + case ERESTART_RESTARTBLOCK: + case ERESTARTNOHAND: regs->regs[2] = EINTR; break; + case ERESTARTSYS: + if (!(ka->sa.sa_flags & SA_RESTART)) { + regs->regs[2] = EINTR; + break; + } + /* fallthrough */ + case ERESTARTNOINTR: + regs->regs[7] = regs->regs[26]; + regs->regs[2] = regs->regs[0]; + regs->cp0_epc -= 4; } - /* fallthrough */ - case ERESTARTNOINTR: /* Userland will reload $v0. */ - regs->regs[7] = regs->regs[26]; - regs->cp0_epc -= 8; - } - regs->regs[0] = 0; /* Don't deal with this again. */ + regs->regs[0] = 0; /* Don't deal with this again. */ + } if (sig_uses_siginfo(ka)) ret = abi->setup_rt_frame(vdso + abi->rt_signal_return_offset, @@ -575,6 +575,9 @@ static int handle_signal(unsigned long sig, siginfo_t *info, ret = abi->setup_frame(vdso + abi->signal_return_offset, ka, regs, sig, oldset); + if (ret) + return ret; + spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) @@ -622,17 +625,13 @@ static void do_signal(struct pt_regs *regs) return; } - /* - * Who's code doesn't conform to the restartable syscall convention - * dies here!!! The li instruction, a single machine instruction, - * must directly be followed by the syscall instruction. - */ if (regs->regs[0]) { if (regs->regs[2] == ERESTARTNOHAND || regs->regs[2] == ERESTARTSYS || regs->regs[2] == ERESTARTNOINTR) { + regs->regs[2] = regs->regs[0]; regs->regs[7] = regs->regs[26]; - regs->cp0_epc -= 8; + regs->cp0_epc -= 4; } if (regs->regs[2] == ERESTART_RESTARTBLOCK) { regs->regs[2] = current->thread.abi->restart; diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c index 2c5df818c65a..ee24d814d5b9 100644 --- a/arch/mips/kernel/signal_n32.c +++ b/arch/mips/kernel/signal_n32.c @@ -109,6 +109,7 @@ asmlinkage int sysn32_rt_sigsuspend(nabi_no_regargs struct pt_regs regs) asmlinkage void sysn32_rt_sigreturn(nabi_no_regargs struct pt_regs regs) { struct rt_sigframe_n32 __user *frame; + mm_segment_t old_fs; sigset_t set; stack_t st; s32 sp; @@ -143,7 +144,11 @@ asmlinkage void sysn32_rt_sigreturn(nabi_no_regargs struct pt_regs regs) /* It is more difficult to avoid calling this function than to call it and ignore errors. */ + old_fs = get_fs(); + set_fs(KERNEL_DS); do_sigaltstack((stack_t __user *)&st, NULL, regs.regs[29]); + set_fs(old_fs); + /* * Don't let your children do this ... diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c index 69b039ca8d83..33d5a5ce4a29 100644 --- a/arch/mips/kernel/unaligned.c +++ b/arch/mips/kernel/unaligned.c @@ -109,8 +109,6 @@ static void emulate_load_store_insn(struct pt_regs *regs, unsigned long value; unsigned int res; - regs->regs[0] = 0; - /* * This load never faults. */ diff --git a/arch/um/drivers/hostaudio_kern.c b/arch/um/drivers/hostaudio_kern.c index 0c46e398cd8f..63c740a85b4c 100644 --- a/arch/um/drivers/hostaudio_kern.c +++ b/arch/um/drivers/hostaudio_kern.c @@ -40,6 +40,11 @@ static char *mixer = HOSTAUDIO_DEV_MIXER; " This is used to specify the host mixer device to the hostaudio driver.\n"\ " The default is \"" HOSTAUDIO_DEV_MIXER "\".\n\n" +module_param(dsp, charp, 0644); +MODULE_PARM_DESC(dsp, DSP_HELP); +module_param(mixer, charp, 0644); +MODULE_PARM_DESC(mixer, MIXER_HELP); + #ifndef MODULE static int set_dsp(char *name, int *add) { @@ -56,15 +61,6 @@ static int set_mixer(char *name, int *add) } __uml_setup("mixer=", set_mixer, "mixer=<mixer device>\n" MIXER_HELP); - -#else /*MODULE*/ - -module_param(dsp, charp, 0644); -MODULE_PARM_DESC(dsp, DSP_HELP); - -module_param(mixer, charp, 0644); -MODULE_PARM_DESC(mixer, MIXER_HELP); - #endif /* /dev/dsp file operations */ diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 1bcd208c459f..9734994cba1e 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -163,6 +163,7 @@ struct ubd { struct scatterlist sg[MAX_SG]; struct request *request; int start_sg, end_sg; + sector_t rq_pos; }; #define DEFAULT_COW { \ @@ -187,6 +188,7 @@ struct ubd { .request = NULL, \ .start_sg = 0, \ .end_sg = 0, \ + .rq_pos = 0, \ } /* Protected by ubd_lock */ @@ -1228,7 +1230,6 @@ static void do_ubd_request(struct request_queue *q) { struct io_thread_req *io_req; struct request *req; - sector_t sector; int n; while(1){ @@ -1239,12 +1240,12 @@ static void do_ubd_request(struct request_queue *q) return; dev->request = req; + dev->rq_pos = blk_rq_pos(req); dev->start_sg = 0; dev->end_sg = blk_rq_map_sg(q, req, dev->sg); } req = dev->request; - sector = blk_rq_pos(req); while(dev->start_sg < dev->end_sg){ struct scatterlist *sg = &dev->sg[dev->start_sg]; @@ -1256,10 +1257,9 @@ static void do_ubd_request(struct request_queue *q) return; } prepare_request(req, io_req, - (unsigned long long)sector << 9, + (unsigned long long)dev->rq_pos << 9, sg->offset, sg->length, sg_page(sg)); - sector += sg->length >> 9; n = os_write_file(thread_fd, &io_req, sizeof(struct io_thread_req *)); if(n != sizeof(struct io_thread_req *)){ @@ -1272,6 +1272,7 @@ static void do_ubd_request(struct request_queue *q) return; } + dev->rq_pos += sg->length >> 9; dev->start_sg++; } dev->end_sg = 0; diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 0350311906ae..2d93bdbc9ac0 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -34,7 +34,7 @@ #include <asm/ia32.h> #undef WARN_OLD -#undef CORE_DUMP /* probably broken */ +#undef CORE_DUMP /* definitely broken */ static int load_aout_binary(struct linux_binprm *, struct pt_regs *regs); static int load_aout_library(struct file *); @@ -131,21 +131,15 @@ static void set_brk(unsigned long start, unsigned long end) * macros to write out all the necessary info. */ -static int dump_write(struct file *file, const void *addr, int nr) -{ - return file->f_op->write(file, addr, nr, &file->f_pos) == nr; -} +#include <linux/coredump.h> #define DUMP_WRITE(addr, nr) \ if (!dump_write(file, (void *)(addr), (nr))) \ goto end_coredump; -#define DUMP_SEEK(offset) \ - if (file->f_op->llseek) { \ - if (file->f_op->llseek(file, (offset), 0) != (offset)) \ - goto end_coredump; \ - } else \ - file->f_pos = (offset) +#define DUMP_SEEK(offset) \ + if (!dump_seek(file, offset)) \ + goto end_coredump; #define START_DATA() (u.u_tsize << PAGE_SHIFT) #define START_STACK(u) (u.start_stack) @@ -217,12 +211,6 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, dump_size = dump.u_ssize << PAGE_SHIFT; DUMP_WRITE(dump_start, dump_size); } - /* - * Finally dump the task struct. Not be used by gdb, but - * could be useful - */ - set_fs(KERNEL_DS); - DUMP_WRITE(current, sizeof(*current)); end_coredump: set_fs(fs); return has_dumped; diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index 5af2982133b5..f16a2caca1e0 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index cb030374b90a..916bc8111a01 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 Advanced Micro Devices, Inc. + * Copyright (C) 2009-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * * This program is free software; you can redistribute it and/or modify it diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 08616180deaf..e3509fc303bf 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -416,13 +416,22 @@ struct amd_iommu { struct dma_ops_domain *default_dom; /* - * This array is required to work around a potential BIOS bug. - * The BIOS may miss to restore parts of the PCI configuration - * space when the system resumes from S3. The result is that the - * IOMMU does not execute commands anymore which leads to system - * failure. + * We can't rely on the BIOS to restore all values on reinit, so we + * need to stash them */ - u32 cache_cfg[4]; + + /* The iommu BAR */ + u32 stored_addr_lo; + u32 stored_addr_hi; + + /* + * Each iommu has 6 l1s, each of which is documented as having 0x12 + * registers + */ + u32 stored_l1[6][0x12]; + + /* The l2 indirect registers */ + u32 stored_l2[0x83]; }; /* diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 4ac5b0f33fc1..bf357f9b25f0 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -17,6 +17,7 @@ extern int fix_aperture; #define GARTEN (1<<0) #define DISGARTCPU (1<<4) #define DISGARTIO (1<<5) +#define DISTLBWALKPRB (1<<6) /* GART cache control register bits. */ #define INVGART (1<<0) @@ -27,7 +28,6 @@ extern int fix_aperture; #define AMD64_GARTAPERTUREBASE 0x94 #define AMD64_GARTTABLEBASE 0x98 #define AMD64_GARTCACHECTL 0x9c -#define AMD64_GARTEN (1<<0) #ifdef CONFIG_GART_IOMMU extern int gart_iommu_aperture; @@ -57,6 +57,19 @@ static inline void gart_iommu_hole_init(void) extern int agp_amd64_init(void); +static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order) +{ + u32 ctl; + + /* + * Don't enable translation but enable GART IO and CPU accesses. + * Also, set DISTLBWALKPRB since GART tables memory is UC. + */ + ctl = DISTLBWALKPRB | order << 1; + + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); +} + static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) { u32 tmp, ctl; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 502e53f999cf..c52e2eb40a1e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -652,20 +652,6 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) return (struct kvm_mmu_page *)page_private(page); } -static inline u16 kvm_read_fs(void) -{ - u16 seg; - asm("mov %%fs, %0" : "=g"(seg)); - return seg; -} - -static inline u16 kvm_read_gs(void) -{ - u16 seg; - asm("mov %%gs, %0" : "=g"(seg)); - return seg; -} - static inline u16 kvm_read_ldt(void) { u16 ldt; @@ -673,16 +659,6 @@ static inline u16 kvm_read_ldt(void) return ldt; } -static inline void kvm_load_fs(u16 sel) -{ - asm("mov %0, %%fs" : : "rm"(sel)); -} - -static inline void kvm_load_gs(u16 sel) -{ - asm("mov %0, %%gs" : : "rm"(sel)); -} - static inline void kvm_load_ldt(u16 sel) { asm("lldt %0" : : "rm"(sel)); diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 679b6450382b..d2fdb0826df2 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 5a170cbbbed8..3cb482e123de 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size) return 1UL << shift; } +/* Access to l1 and l2 indexed register spaces */ + +static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address) +{ + u32 val; + + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); + pci_read_config_dword(iommu->dev, 0xfc, &val); + return val; +} + +static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val) +{ + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31)); + pci_write_config_dword(iommu->dev, 0xfc, val); + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); +} + +static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address) +{ + u32 val; + + pci_write_config_dword(iommu->dev, 0xf0, address); + pci_read_config_dword(iommu->dev, 0xf4, &val); + return val; +} + +static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val) +{ + pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8)); + pci_write_config_dword(iommu->dev, 0xf4, val); +} + /**************************************************************************** * * AMD IOMMU MMIO register space handling functions @@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) { int cap_ptr = iommu->cap_ptr; u32 range, misc; + int i, j; pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, &iommu->cap); @@ -633,12 +667,29 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) MMIO_GET_LD(range)); iommu->evt_msi_num = MMIO_MSI_NUM(misc); - if (is_rd890_iommu(iommu->dev)) { - pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]); - pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]); - pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]); - pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]); - } + if (!is_rd890_iommu(iommu->dev)) + return; + + /* + * Some rd890 systems may not be fully reconfigured by the BIOS, so + * it's necessary for us to store this information so it can be + * reprogrammed on resume + */ + + pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4, + &iommu->stored_addr_lo); + pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8, + &iommu->stored_addr_hi); + + /* Low bit locks writes to configuration space */ + iommu->stored_addr_lo &= ~1; + + for (i = 0; i < 6; i++) + for (j = 0; j < 0x12; j++) + iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j); + + for (i = 0; i < 0x83; i++) + iommu->stored_l2[i] = iommu_read_l2(iommu, i); } /* @@ -1127,14 +1178,53 @@ static void iommu_init_flags(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_COHERENT_EN); } -static void iommu_apply_quirks(struct amd_iommu *iommu) +static void iommu_apply_resume_quirks(struct amd_iommu *iommu) { - if (is_rd890_iommu(iommu->dev)) { - pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]); - pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]); - pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]); - pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]); - } + int i, j; + u32 ioc_feature_control; + struct pci_dev *pdev = NULL; + + /* RD890 BIOSes may not have completely reconfigured the iommu */ + if (!is_rd890_iommu(iommu->dev)) + return; + + /* + * First, we need to ensure that the iommu is enabled. This is + * controlled by a register in the northbridge + */ + pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0)); + + if (!pdev) + return; + + /* Select Northbridge indirect register 0x75 and enable writing */ + pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7)); + pci_read_config_dword(pdev, 0x64, &ioc_feature_control); + + /* Enable the iommu */ + if (!(ioc_feature_control & 0x1)) + pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1); + + pci_dev_put(pdev); + + /* Restore the iommu BAR */ + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, + iommu->stored_addr_lo); + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8, + iommu->stored_addr_hi); + + /* Restore the l1 indirect regs for each of the 6 l1s */ + for (i = 0; i < 6; i++) + for (j = 0; j < 0x12; j++) + iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]); + + /* Restore the l2 indirect regs */ + for (i = 0; i < 0x83; i++) + iommu_write_l2(iommu, i, iommu->stored_l2[i]); + + /* Lock PCI setup registers */ + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, + iommu->stored_addr_lo | 1); } /* @@ -1147,7 +1237,6 @@ static void enable_iommus(void) for_each_iommu(iommu) { iommu_disable(iommu); - iommu_apply_quirks(iommu); iommu_init_flags(iommu); iommu_set_device_table(iommu); iommu_enable_command_buffer(iommu); @@ -1173,6 +1262,11 @@ static void disable_iommus(void) static int amd_iommu_resume(struct sys_device *dev) { + struct amd_iommu *iommu; + + for_each_iommu(iommu) + iommu_apply_resume_quirks(iommu); + /* re-load the hardware */ enable_iommus(); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index a2e0caf26e17..c9cb17368448 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); - aper_enabled = ctl & AMD64_GARTEN; + aper_enabled = ctl & GARTEN; aper_order = (ctl >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; @@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); - ctl &= ~AMD64_GARTEN; + ctl &= ~GARTEN; write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); } } @@ -505,8 +505,13 @@ out: /* Fix up the north bridges */ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { - int bus; - int dev_base, dev_limit; + int bus, dev_base, dev_limit; + + /* + * Don't enable translation yet but enable GART IO and CPU + * accesses and set DISTLBWALKPRB since GART table memory is UC. + */ + u32 ctl = DISTLBWALKPRB | aper_order << 1; bus = bus_dev_ranges[i].bus; dev_base = bus_dev_ranges[i].dev_base; @@ -515,10 +520,7 @@ out: if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) continue; - /* Don't enable translation yet. That is done later. - Assume this BIOS didn't initialise the GART so - just overwrite all previous bits */ - write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1); + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 5e975298fa81..39aaee5c1ab2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -141,6 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) address = (low & MASK_BLKPTR_LO) >> 21; if (!address) break; + address += MCG_XBLK_ADDR; } else ++address; @@ -148,12 +149,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (rdmsr_safe(address, &low, &high)) break; - if (!(high & MASK_VALID_HI)) { - if (block) - continue; - else - break; - } + if (!(high & MASK_VALID_HI)) + continue; if (!(high & MASK_CNTP_HI) || (high & MASK_LOCKED_HI)) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d9368eeda309..169d8804a9f8 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -216,7 +216,7 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, err = sysfs_add_file_to_group(&sys_dev->kobj, &attr_core_power_limit_count.attr, thermal_attr_group.name); - if (cpu_has(c, X86_FEATURE_PTS)) + if (cpu_has(c, X86_FEATURE_PTS)) { err = sysfs_add_file_to_group(&sys_dev->kobj, &attr_package_throttle_count.attr, thermal_attr_group.name); @@ -224,6 +224,7 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, err = sysfs_add_file_to_group(&sys_dev->kobj, &attr_package_power_limit_count.attr, thermal_attr_group.name); + } return err; } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 0f7f130caa67..6015ee13e22b 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -601,7 +601,7 @@ static void gart_fixup_northbridges(struct sys_device *dev) * Don't enable translations just yet. That is the next * step. Restore the pre-suspend aperture settings. */ - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); + gart_set_size_and_enable(dev, aperture_order); pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); } } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bc5b9b8d4a33..8a3f9f64f86f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -766,7 +766,6 @@ static void init_vmcb(struct vcpu_svm *svm) control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __pa(svm->msrpm); - control->tsc_offset = 0; control->int_ctl = V_INTR_MASKING_MASK; init_seg(&save->es); @@ -902,6 +901,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); + svm->vmcb->control.tsc_offset = 0-native_read_tsc(); err = fx_init(&svm->vcpu); if (err) @@ -3163,8 +3163,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) sync_lapic_to_cr8(vcpu); save_host_msrs(vcpu); - fs_selector = kvm_read_fs(); - gs_selector = kvm_read_gs(); + savesegment(fs, fs_selector); + savesegment(gs, gs_selector); ldt_selector = kvm_read_ldt(); svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ @@ -3251,10 +3251,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - kvm_load_fs(fs_selector); - kvm_load_gs(gs_selector); - kvm_load_ldt(ldt_selector); load_host_msrs(vcpu); + loadsegment(fs, fs_selector); +#ifdef CONFIG_X86_64 + load_gs_index(gs_selector); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); +#else + loadsegment(gs, gs_selector); +#endif + kvm_load_ldt(ldt_selector); reload_tss(vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 49b25eee25ac..7bddfab12013 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -803,7 +803,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) */ vmx->host_state.ldt_sel = kvm_read_ldt(); vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; - vmx->host_state.fs_sel = kvm_read_fs(); + savesegment(fs, vmx->host_state.fs_sel); if (!(vmx->host_state.fs_sel & 7)) { vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); vmx->host_state.fs_reload_needed = 0; @@ -811,7 +811,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - vmx->host_state.gs_sel = kvm_read_gs(); + savesegment(gs, vmx->host_state.gs_sel); if (!(vmx->host_state.gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { @@ -841,27 +841,21 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) static void __vmx_load_host_state(struct vcpu_vmx *vmx) { - unsigned long flags; - if (!vmx->host_state.loaded) return; ++vmx->vcpu.stat.host_state_reload; vmx->host_state.loaded = 0; if (vmx->host_state.fs_reload_needed) - kvm_load_fs(vmx->host_state.fs_sel); + loadsegment(fs, vmx->host_state.fs_sel); if (vmx->host_state.gs_ldt_reload_needed) { kvm_load_ldt(vmx->host_state.ldt_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_save(flags); - kvm_load_gs(vmx->host_state.gs_sel); #ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); + load_gs_index(vmx->host_state.gs_sel); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); +#else + loadsegment(gs, vmx->host_state.gs_sel); #endif - local_irq_restore(flags); } reload_tss(); #ifdef CONFIG_X86_64 @@ -2589,8 +2583,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index f9897f7a9ef1..9c0d0d399c30 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -420,9 +420,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; } - for_each_node_mask(i, nodes_parsed) - e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, - nodes[i].end >> PAGE_SHIFT); + for (i = 0; i < num_node_memblks; i++) + e820_register_active_regions(memblk_nodeid[i], + node_memblk_range[i].start >> PAGE_SHIFT, + node_memblk_range[i].end >> PAGE_SHIFT); + /* for out of order entries in SRAT */ sort_node_map(); if (!nodes_cover_memory(nodes)) { diff --git a/block/bsg.c b/block/bsg.c index 82d58829ba59..0c00870553a3 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -426,7 +426,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, /* * fill in all the output members */ - hdr->device_status = status_byte(rq->errors); + hdr->device_status = rq->errors & 0xff; hdr->transport_status = host_byte(rq->errors); hdr->driver_status = driver_byte(rq->errors); hdr->info = 0; diff --git a/block/elevator.c b/block/elevator.c index 205b09a5bd9e..4e11559aa2b0 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -938,6 +938,7 @@ int elv_register_queue(struct request_queue *q) } } kobject_uevent(&e->kobj, KOBJ_ADD); + e->registered = 1; } return error; } @@ -947,6 +948,7 @@ static void __elv_unregister_queue(struct elevator_queue *e) { kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); + e->registered = 0; } void elv_unregister_queue(struct request_queue *q) @@ -1042,11 +1044,13 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) spin_unlock_irq(q->queue_lock); - __elv_unregister_queue(old_elevator); + if (old_elevator->registered) { + __elv_unregister_queue(old_elevator); - err = elv_register_queue(q); - if (err) - goto fail_register; + err = elv_register_queue(q); + if (err) + goto fail_register; + } /* * finally exit old elevator and turn off BYPASS. diff --git a/drivers/acpi/blacklist.c b/drivers/acpi/blacklist.c index f7619600270a..af308d03f492 100644 --- a/drivers/acpi/blacklist.c +++ b/drivers/acpi/blacklist.c @@ -204,6 +204,23 @@ static struct dmi_system_id acpi_osi_dmi_table[] __initdata = { }, }, { + /* + * There have a NVIF method in MSI GX723 DSDT need call by Nvidia + * driver (e.g. nouveau) when user press brightness hotkey. + * Currently, nouveau driver didn't do the job and it causes there + * have a infinite while loop in DSDT when user press hotkey. + * We add MSI GX723's dmi information to this table for workaround + * this issue. + * Will remove MSI GX723 from the table after nouveau grows support. + */ + .callback = dmi_disable_osi_vista, + .ident = "MSI GX723", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Micro-Star International"), + DMI_MATCH(DMI_PRODUCT_NAME, "GX723"), + }, + }, + { .callback = dmi_disable_osi_vista, .ident = "Sony VGN-NS10J_S", .matches = { diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index b618f888d66b..bec561c14beb 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -346,4 +346,5 @@ void __init acpi_early_processor_set_pdc(void) acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX, early_init_pdc, NULL, NULL, NULL); + acpi_get_devices("ACPI0007", early_init_pdc, NULL, NULL); } diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c index ee9ddeb53417..8cb0347dec28 100644 --- a/drivers/atm/iphase.c +++ b/drivers/atm/iphase.c @@ -3156,7 +3156,6 @@ static int __devinit ia_init_one(struct pci_dev *pdev, { struct atm_dev *dev; IADEV *iadev; - unsigned long flags; int ret; iadev = kzalloc(sizeof(*iadev), GFP_KERNEL); @@ -3188,19 +3187,14 @@ static int __devinit ia_init_one(struct pci_dev *pdev, ia_dev[iadev_count] = iadev; _ia_dev[iadev_count] = dev; iadev_count++; - spin_lock_init(&iadev->misc_lock); - /* First fixes first. I don't want to think about this now. */ - spin_lock_irqsave(&iadev->misc_lock, flags); if (ia_init(dev) || ia_start(dev)) { IF_INIT(printk("IA register failed!\n");) iadev_count--; ia_dev[iadev_count] = NULL; _ia_dev[iadev_count] = NULL; - spin_unlock_irqrestore(&iadev->misc_lock, flags); ret = -EINVAL; goto err_out_deregister_dev; } - spin_unlock_irqrestore(&iadev->misc_lock, flags); IF_EVENT(printk("iadev_count = %d\n", iadev_count);) iadev->next_board = ia_boards; diff --git a/drivers/atm/iphase.h b/drivers/atm/iphase.h index b2cd20f549cb..077735e0e04b 100644 --- a/drivers/atm/iphase.h +++ b/drivers/atm/iphase.h @@ -1022,7 +1022,7 @@ typedef struct iadev_t { struct dle_q rx_dle_q; struct free_desc_q *rx_free_desc_qhead; struct sk_buff_head rx_dma_q; - spinlock_t rx_lock, misc_lock; + spinlock_t rx_lock; struct atm_vcc **rx_open; /* list of all open VCs */ u16 num_rx_desc, rx_buf_sz, rxing; u32 rx_pkt_ram, rx_tmp_cnt; diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c index f916ddf63938..f46138ab38b6 100644 --- a/drivers/atm/solos-pci.c +++ b/drivers/atm/solos-pci.c @@ -444,6 +444,7 @@ static ssize_t console_show(struct device *dev, struct device_attribute *attr, struct atm_dev *atmdev = container_of(dev, struct atm_dev, class_dev); struct solos_card *card = atmdev->dev_data; struct sk_buff *skb; + unsigned int len; spin_lock(&card->cli_queue_lock); skb = skb_dequeue(&card->cli_queue[SOLOS_CHAN(atmdev)]); @@ -451,11 +452,12 @@ static ssize_t console_show(struct device *dev, struct device_attribute *attr, if(skb == NULL) return sprintf(buf, "No data.\n"); - memcpy(buf, skb->data, skb->len); - dev_dbg(&card->dev->dev, "len: %d\n", skb->len); + len = skb->len; + memcpy(buf, skb->data, len); + dev_dbg(&card->dev->dev, "len: %d\n", len); kfree_skb(skb); - return skb->len; + return len; } static int send_command(struct solos_card *card, int dev, const char *buf, size_t size) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index de277689da61..4b9359a6f6ca 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -488,4 +488,21 @@ config BLK_DEV_HD If unsure, say N. +config BLK_DEV_RBD + tristate "Rados block device (RBD)" + depends on INET && EXPERIMENTAL && BLOCK + select CEPH_LIB + select LIBCRC32C + select CRYPTO_AES + select CRYPTO + default n + help + Say Y here if you want include the Rados block device, which stripes + a block device over objects stored in the Ceph distributed object + store. + + More information at http://ceph.newdream.net/. + + If unsure, say N. + endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index aff5ac925c34..d7f463d6312d 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -37,5 +37,6 @@ obj-$(CONFIG_BLK_DEV_HD) += hd.o obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ +obj-$(CONFIG_BLK_DEV_RBD) += rbd.o swim_mod-objs := swim.o swim_asm.o diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index e9da874d0419..03688c2da319 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -113,7 +113,7 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev, memcpy(buf, dev->bounce_buf+offset, size); offset += size; flush_kernel_dcache_page(bvec->bv_page); - bvec_kunmap_irq(bvec, &flags); + bvec_kunmap_irq(buf, &flags); i++; } } diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c new file mode 100644 index 000000000000..6ec9d53806c5 --- /dev/null +++ b/drivers/block/rbd.c @@ -0,0 +1,1841 @@ +/* + rbd.c -- Export ceph rados objects as a Linux block device + + + based on drivers/block/osdblk.c: + + Copyright 2009 Red Hat, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + + + Instructions for use + -------------------- + + 1) Map a Linux block device to an existing rbd image. + + Usage: <mon ip addr> <options> <pool name> <rbd image name> [snap name] + + $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add + + The snapshot name can be "-" or omitted to map the image read/write. + + 2) List all active blkdev<->object mappings. + + In this example, we have performed step #1 twice, creating two blkdevs, + mapped to two separate rados objects in the rados rbd pool + + $ cat /sys/class/rbd/list + #id major client_name pool name snap KB + 0 254 client4143 rbd foo - 1024000 + + The columns, in order, are: + - blkdev unique id + - blkdev assigned major + - rados client id + - rados pool name + - rados block device name + - mapped snapshot ("-" if none) + - device size in KB + + + 3) Create a snapshot. + + Usage: <blkdev id> <snapname> + + $ echo "0 mysnap" > /sys/class/rbd/snap_create + + + 4) Listing a snapshot. + + $ cat /sys/class/rbd/snaps_list + #id snap KB + 0 - 1024000 (*) + 0 foo 1024000 + + The columns, in order, are: + - blkdev unique id + - snapshot name, '-' means none (active read/write version) + - size of device at time of snapshot + - the (*) indicates this is the active version + + 5) Rollback to snapshot. + + Usage: <blkdev id> <snapname> + + $ echo "0 mysnap" > /sys/class/rbd/snap_rollback + + + 6) Mapping an image using snapshot. + + A snapshot mapping is read-only. This is being done by passing + snap=<snapname> to the options when adding a device. + + $ echo "192.168.0.1 name=admin,snap=mysnap rbd foo" > /sys/class/rbd/add + + + 7) Remove an active blkdev<->rbd image mapping. + + In this example, we remove the mapping with blkdev unique id 1. + + $ echo 1 > /sys/class/rbd/remove + + + NOTE: The actual creation and deletion of rados objects is outside the scope + of this driver. + + */ + +#include <linux/ceph/libceph.h> +#include <linux/ceph/osd_client.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/decode.h> + +#include <linux/kernel.h> +#include <linux/device.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/blkdev.h> + +#include "rbd_types.h" + +#define DRV_NAME "rbd" +#define DRV_NAME_LONG "rbd (rados block device)" + +#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ + +#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX)) +#define RBD_MAX_POOL_NAME_LEN 64 +#define RBD_MAX_SNAP_NAME_LEN 32 +#define RBD_MAX_OPT_LEN 1024 + +#define RBD_SNAP_HEAD_NAME "-" + +#define DEV_NAME_LEN 32 + +/* + * block device image metadata (in-memory version) + */ +struct rbd_image_header { + u64 image_size; + char block_name[32]; + __u8 obj_order; + __u8 crypt_type; + __u8 comp_type; + struct rw_semaphore snap_rwsem; + struct ceph_snap_context *snapc; + size_t snap_names_len; + u64 snap_seq; + u32 total_snaps; + + char *snap_names; + u64 *snap_sizes; +}; + +/* + * an instance of the client. multiple devices may share a client. + */ +struct rbd_client { + struct ceph_client *client; + struct kref kref; + struct list_head node; +}; + +/* + * a single io request + */ +struct rbd_request { + struct request *rq; /* blk layer request */ + struct bio *bio; /* cloned bio */ + struct page **pages; /* list of used pages */ + u64 len; +}; + +/* + * a single device + */ +struct rbd_device { + int id; /* blkdev unique id */ + + int major; /* blkdev assigned major */ + struct gendisk *disk; /* blkdev's gendisk and rq */ + struct request_queue *q; + + struct ceph_client *client; + struct rbd_client *rbd_client; + + char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ + + spinlock_t lock; /* queue lock */ + + struct rbd_image_header header; + char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */ + int obj_len; + char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */ + char pool_name[RBD_MAX_POOL_NAME_LEN]; + int poolid; + + char snap_name[RBD_MAX_SNAP_NAME_LEN]; + u32 cur_snap; /* index+1 of current snapshot within snap context + 0 - for the head */ + int read_only; + + struct list_head node; +}; + +static spinlock_t node_lock; /* protects client get/put */ + +static struct class *class_rbd; /* /sys/class/rbd */ +static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ +static LIST_HEAD(rbd_dev_list); /* devices */ +static LIST_HEAD(rbd_client_list); /* clients */ + + +static int rbd_open(struct block_device *bdev, fmode_t mode) +{ + struct gendisk *disk = bdev->bd_disk; + struct rbd_device *rbd_dev = disk->private_data; + + set_device_ro(bdev, rbd_dev->read_only); + + if ((mode & FMODE_WRITE) && rbd_dev->read_only) + return -EROFS; + + return 0; +} + +static const struct block_device_operations rbd_bd_ops = { + .owner = THIS_MODULE, + .open = rbd_open, +}; + +/* + * Initialize an rbd client instance. + * We own *opt. + */ +static struct rbd_client *rbd_client_create(struct ceph_options *opt) +{ + struct rbd_client *rbdc; + int ret = -ENOMEM; + + dout("rbd_client_create\n"); + rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); + if (!rbdc) + goto out_opt; + + kref_init(&rbdc->kref); + INIT_LIST_HEAD(&rbdc->node); + + rbdc->client = ceph_create_client(opt, rbdc); + if (IS_ERR(rbdc->client)) + goto out_rbdc; + opt = NULL; /* Now rbdc->client is responsible for opt */ + + ret = ceph_open_session(rbdc->client); + if (ret < 0) + goto out_err; + + spin_lock(&node_lock); + list_add_tail(&rbdc->node, &rbd_client_list); + spin_unlock(&node_lock); + + dout("rbd_client_create created %p\n", rbdc); + return rbdc; + +out_err: + ceph_destroy_client(rbdc->client); +out_rbdc: + kfree(rbdc); +out_opt: + if (opt) + ceph_destroy_options(opt); + return ERR_PTR(ret); +} + +/* + * Find a ceph client with specific addr and configuration. + */ +static struct rbd_client *__rbd_client_find(struct ceph_options *opt) +{ + struct rbd_client *client_node; + + if (opt->flags & CEPH_OPT_NOSHARE) + return NULL; + + list_for_each_entry(client_node, &rbd_client_list, node) + if (ceph_compare_options(opt, client_node->client) == 0) + return client_node; + return NULL; +} + +/* + * Get a ceph client with specific addr and configuration, if one does + * not exist create it. + */ +static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, + char *options) +{ + struct rbd_client *rbdc; + struct ceph_options *opt; + int ret; + + ret = ceph_parse_options(&opt, options, mon_addr, + mon_addr + strlen(mon_addr), NULL, NULL); + if (ret < 0) + return ret; + + spin_lock(&node_lock); + rbdc = __rbd_client_find(opt); + if (rbdc) { + ceph_destroy_options(opt); + + /* using an existing client */ + kref_get(&rbdc->kref); + rbd_dev->rbd_client = rbdc; + rbd_dev->client = rbdc->client; + spin_unlock(&node_lock); + return 0; + } + spin_unlock(&node_lock); + + rbdc = rbd_client_create(opt); + if (IS_ERR(rbdc)) + return PTR_ERR(rbdc); + + rbd_dev->rbd_client = rbdc; + rbd_dev->client = rbdc->client; + return 0; +} + +/* + * Destroy ceph client + */ +static void rbd_client_release(struct kref *kref) +{ + struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); + + dout("rbd_release_client %p\n", rbdc); + spin_lock(&node_lock); + list_del(&rbdc->node); + spin_unlock(&node_lock); + + ceph_destroy_client(rbdc->client); + kfree(rbdc); +} + +/* + * Drop reference to ceph client node. If it's not referenced anymore, release + * it. + */ +static void rbd_put_client(struct rbd_device *rbd_dev) +{ + kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); + rbd_dev->rbd_client = NULL; + rbd_dev->client = NULL; +} + + +/* + * Create a new header structure, translate header format from the on-disk + * header. + */ +static int rbd_header_from_disk(struct rbd_image_header *header, + struct rbd_image_header_ondisk *ondisk, + int allocated_snaps, + gfp_t gfp_flags) +{ + int i; + u32 snap_count = le32_to_cpu(ondisk->snap_count); + int ret = -ENOMEM; + + init_rwsem(&header->snap_rwsem); + + header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); + header->snapc = kmalloc(sizeof(struct ceph_snap_context) + + snap_count * + sizeof(struct rbd_image_snap_ondisk), + gfp_flags); + if (!header->snapc) + return -ENOMEM; + if (snap_count) { + header->snap_names = kmalloc(header->snap_names_len, + GFP_KERNEL); + if (!header->snap_names) + goto err_snapc; + header->snap_sizes = kmalloc(snap_count * sizeof(u64), + GFP_KERNEL); + if (!header->snap_sizes) + goto err_names; + } else { + header->snap_names = NULL; + header->snap_sizes = NULL; + } + memcpy(header->block_name, ondisk->block_name, + sizeof(ondisk->block_name)); + + header->image_size = le64_to_cpu(ondisk->image_size); + header->obj_order = ondisk->options.order; + header->crypt_type = ondisk->options.crypt_type; + header->comp_type = ondisk->options.comp_type; + + atomic_set(&header->snapc->nref, 1); + header->snap_seq = le64_to_cpu(ondisk->snap_seq); + header->snapc->num_snaps = snap_count; + header->total_snaps = snap_count; + + if (snap_count && + allocated_snaps == snap_count) { + for (i = 0; i < snap_count; i++) { + header->snapc->snaps[i] = + le64_to_cpu(ondisk->snaps[i].id); + header->snap_sizes[i] = + le64_to_cpu(ondisk->snaps[i].image_size); + } + + /* copy snapshot names */ + memcpy(header->snap_names, &ondisk->snaps[i], + header->snap_names_len); + } + + return 0; + +err_names: + kfree(header->snap_names); +err_snapc: + kfree(header->snapc); + return ret; +} + +static int snap_index(struct rbd_image_header *header, int snap_num) +{ + return header->total_snaps - snap_num; +} + +static u64 cur_snap_id(struct rbd_device *rbd_dev) +{ + struct rbd_image_header *header = &rbd_dev->header; + + if (!rbd_dev->cur_snap) + return 0; + + return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)]; +} + +static int snap_by_name(struct rbd_image_header *header, const char *snap_name, + u64 *seq, u64 *size) +{ + int i; + char *p = header->snap_names; + + for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) { + if (strcmp(snap_name, p) == 0) + break; + } + if (i == header->total_snaps) + return -ENOENT; + if (seq) + *seq = header->snapc->snaps[i]; + + if (size) + *size = header->snap_sizes[i]; + + return i; +} + +static int rbd_header_set_snap(struct rbd_device *dev, + const char *snap_name, + u64 *size) +{ + struct rbd_image_header *header = &dev->header; + struct ceph_snap_context *snapc = header->snapc; + int ret = -ENOENT; + + down_write(&header->snap_rwsem); + + if (!snap_name || + !*snap_name || + strcmp(snap_name, "-") == 0 || + strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) { + if (header->total_snaps) + snapc->seq = header->snap_seq; + else + snapc->seq = 0; + dev->cur_snap = 0; + dev->read_only = 0; + if (size) + *size = header->image_size; + } else { + ret = snap_by_name(header, snap_name, &snapc->seq, size); + if (ret < 0) + goto done; + + dev->cur_snap = header->total_snaps - ret; + dev->read_only = 1; + } + + ret = 0; +done: + up_write(&header->snap_rwsem); + return ret; +} + +static void rbd_header_free(struct rbd_image_header *header) +{ + kfree(header->snapc); + kfree(header->snap_names); + kfree(header->snap_sizes); +} + +/* + * get the actual striped segment name, offset and length + */ +static u64 rbd_get_segment(struct rbd_image_header *header, + const char *block_name, + u64 ofs, u64 len, + char *seg_name, u64 *segofs) +{ + u64 seg = ofs >> header->obj_order; + + if (seg_name) + snprintf(seg_name, RBD_MAX_SEG_NAME_LEN, + "%s.%012llx", block_name, seg); + + ofs = ofs & ((1 << header->obj_order) - 1); + len = min_t(u64, len, (1 << header->obj_order) - ofs); + + if (segofs) + *segofs = ofs; + + return len; +} + +/* + * bio helpers + */ + +static void bio_chain_put(struct bio *chain) +{ + struct bio *tmp; + + while (chain) { + tmp = chain; + chain = chain->bi_next; + bio_put(tmp); + } +} + +/* + * zeros a bio chain, starting at specific offset + */ +static void zero_bio_chain(struct bio *chain, int start_ofs) +{ + struct bio_vec *bv; + unsigned long flags; + void *buf; + int i; + int pos = 0; + + while (chain) { + bio_for_each_segment(bv, chain, i) { + if (pos + bv->bv_len > start_ofs) { + int remainder = max(start_ofs - pos, 0); + buf = bvec_kmap_irq(bv, &flags); + memset(buf + remainder, 0, + bv->bv_len - remainder); + bvec_kunmap_irq(buf, &flags); + } + pos += bv->bv_len; + } + + chain = chain->bi_next; + } +} + +/* + * bio_chain_clone - clone a chain of bios up to a certain length. + * might return a bio_pair that will need to be released. + */ +static struct bio *bio_chain_clone(struct bio **old, struct bio **next, + struct bio_pair **bp, + int len, gfp_t gfpmask) +{ + struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL; + int total = 0; + + if (*bp) { + bio_pair_release(*bp); + *bp = NULL; + } + + while (old_chain && (total < len)) { + tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); + if (!tmp) + goto err_out; + + if (total + old_chain->bi_size > len) { + struct bio_pair *bp; + + /* + * this split can only happen with a single paged bio, + * split_bio will BUG_ON if this is not the case + */ + dout("bio_chain_clone split! total=%d remaining=%d" + "bi_size=%d\n", + (int)total, (int)len-total, + (int)old_chain->bi_size); + + /* split the bio. We'll release it either in the next + call, or it will have to be released outside */ + bp = bio_split(old_chain, (len - total) / 512ULL); + if (!bp) + goto err_out; + + __bio_clone(tmp, &bp->bio1); + + *next = &bp->bio2; + } else { + __bio_clone(tmp, old_chain); + *next = old_chain->bi_next; + } + + tmp->bi_bdev = NULL; + gfpmask &= ~__GFP_WAIT; + tmp->bi_next = NULL; + + if (!new_chain) { + new_chain = tail = tmp; + } else { + tail->bi_next = tmp; + tail = tmp; + } + old_chain = old_chain->bi_next; + + total += tmp->bi_size; + } + + BUG_ON(total < len); + + if (tail) + tail->bi_next = NULL; + + *old = old_chain; + + return new_chain; + +err_out: + dout("bio_chain_clone with err\n"); + bio_chain_put(new_chain); + return NULL; +} + +/* + * helpers for osd request op vectors. + */ +static int rbd_create_rw_ops(struct ceph_osd_req_op **ops, + int num_ops, + int opcode, + u32 payload_len) +{ + *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1), + GFP_NOIO); + if (!*ops) + return -ENOMEM; + (*ops)[0].op = opcode; + /* + * op extent offset and length will be set later on + * in calc_raw_layout() + */ + (*ops)[0].payload_len = payload_len; + return 0; +} + +static void rbd_destroy_ops(struct ceph_osd_req_op *ops) +{ + kfree(ops); +} + +/* + * Send ceph osd request + */ +static int rbd_do_request(struct request *rq, + struct rbd_device *dev, + struct ceph_snap_context *snapc, + u64 snapid, + const char *obj, u64 ofs, u64 len, + struct bio *bio, + struct page **pages, + int num_pages, + int flags, + struct ceph_osd_req_op *ops, + int num_reply, + void (*rbd_cb)(struct ceph_osd_request *req, + struct ceph_msg *msg)) +{ + struct ceph_osd_request *req; + struct ceph_file_layout *layout; + int ret; + u64 bno; + struct timespec mtime = CURRENT_TIME; + struct rbd_request *req_data; + struct ceph_osd_request_head *reqhead; + struct rbd_image_header *header = &dev->header; + + ret = -ENOMEM; + req_data = kzalloc(sizeof(*req_data), GFP_NOIO); + if (!req_data) + goto done; + + dout("rbd_do_request len=%lld ofs=%lld\n", len, ofs); + + down_read(&header->snap_rwsem); + + req = ceph_osdc_alloc_request(&dev->client->osdc, flags, + snapc, + ops, + false, + GFP_NOIO, pages, bio); + if (IS_ERR(req)) { + up_read(&header->snap_rwsem); + ret = PTR_ERR(req); + goto done_pages; + } + + req->r_callback = rbd_cb; + + req_data->rq = rq; + req_data->bio = bio; + req_data->pages = pages; + req_data->len = len; + + req->r_priv = req_data; + + reqhead = req->r_request->front.iov_base; + reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); + + strncpy(req->r_oid, obj, sizeof(req->r_oid)); + req->r_oid_len = strlen(req->r_oid); + + layout = &req->r_file_layout; + memset(layout, 0, sizeof(*layout)); + layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + layout->fl_stripe_count = cpu_to_le32(1); + layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + layout->fl_pg_preferred = cpu_to_le32(-1); + layout->fl_pg_pool = cpu_to_le32(dev->poolid); + ceph_calc_raw_layout(&dev->client->osdc, layout, snapid, + ofs, &len, &bno, req, ops); + + ceph_osdc_build_request(req, ofs, &len, + ops, + snapc, + &mtime, + req->r_oid, req->r_oid_len); + up_read(&header->snap_rwsem); + + ret = ceph_osdc_start_request(&dev->client->osdc, req, false); + if (ret < 0) + goto done_err; + + if (!rbd_cb) { + ret = ceph_osdc_wait_request(&dev->client->osdc, req); + ceph_osdc_put_request(req); + } + return ret; + +done_err: + bio_chain_put(req_data->bio); + ceph_osdc_put_request(req); +done_pages: + kfree(req_data); +done: + if (rq) + blk_end_request(rq, ret, len); + return ret; +} + +/* + * Ceph osd op callback + */ +static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) +{ + struct rbd_request *req_data = req->r_priv; + struct ceph_osd_reply_head *replyhead; + struct ceph_osd_op *op; + __s32 rc; + u64 bytes; + int read_op; + + /* parse reply */ + replyhead = msg->front.iov_base; + WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); + op = (void *)(replyhead + 1); + rc = le32_to_cpu(replyhead->result); + bytes = le64_to_cpu(op->extent.length); + read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ); + + dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc); + + if (rc == -ENOENT && read_op) { + zero_bio_chain(req_data->bio, 0); + rc = 0; + } else if (rc == 0 && read_op && bytes < req_data->len) { + zero_bio_chain(req_data->bio, bytes); + bytes = req_data->len; + } + + blk_end_request(req_data->rq, rc, bytes); + + if (req_data->bio) + bio_chain_put(req_data->bio); + + ceph_osdc_put_request(req); + kfree(req_data); +} + +/* + * Do a synchronous ceph osd operation + */ +static int rbd_req_sync_op(struct rbd_device *dev, + struct ceph_snap_context *snapc, + u64 snapid, + int opcode, + int flags, + struct ceph_osd_req_op *orig_ops, + int num_reply, + const char *obj, + u64 ofs, u64 len, + char *buf) +{ + int ret; + struct page **pages; + int num_pages; + struct ceph_osd_req_op *ops = orig_ops; + u32 payload_len; + + num_pages = calc_pages_for(ofs , len); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + if (!orig_ops) { + payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0); + ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); + if (ret < 0) + goto done; + + if ((flags & CEPH_OSD_FLAG_WRITE) && buf) { + ret = ceph_copy_to_page_vector(pages, buf, ofs, len); + if (ret < 0) + goto done_ops; + } + } + + ret = rbd_do_request(NULL, dev, snapc, snapid, + obj, ofs, len, NULL, + pages, num_pages, + flags, + ops, + 2, + NULL); + if (ret < 0) + goto done_ops; + + if ((flags & CEPH_OSD_FLAG_READ) && buf) + ret = ceph_copy_from_page_vector(pages, buf, ofs, ret); + +done_ops: + if (!orig_ops) + rbd_destroy_ops(ops); +done: + ceph_release_page_vector(pages, num_pages); + return ret; +} + +/* + * Do an asynchronous ceph osd operation + */ +static int rbd_do_op(struct request *rq, + struct rbd_device *rbd_dev , + struct ceph_snap_context *snapc, + u64 snapid, + int opcode, int flags, int num_reply, + u64 ofs, u64 len, + struct bio *bio) +{ + char *seg_name; + u64 seg_ofs; + u64 seg_len; + int ret; + struct ceph_osd_req_op *ops; + u32 payload_len; + + seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); + if (!seg_name) + return -ENOMEM; + + seg_len = rbd_get_segment(&rbd_dev->header, + rbd_dev->header.block_name, + ofs, len, + seg_name, &seg_ofs); + + payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); + + ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); + if (ret < 0) + goto done; + + /* we've taken care of segment sizes earlier when we + cloned the bios. We should never have a segment + truncated at this point */ + BUG_ON(seg_len < len); + + ret = rbd_do_request(rq, rbd_dev, snapc, snapid, + seg_name, seg_ofs, seg_len, + bio, + NULL, 0, + flags, + ops, + num_reply, + rbd_req_cb); +done: + kfree(seg_name); + return ret; +} + +/* + * Request async osd write + */ +static int rbd_req_write(struct request *rq, + struct rbd_device *rbd_dev, + struct ceph_snap_context *snapc, + u64 ofs, u64 len, + struct bio *bio) +{ + return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, + 2, + ofs, len, bio); +} + +/* + * Request async osd read + */ +static int rbd_req_read(struct request *rq, + struct rbd_device *rbd_dev, + u64 snapid, + u64 ofs, u64 len, + struct bio *bio) +{ + return rbd_do_op(rq, rbd_dev, NULL, + (snapid ? snapid : CEPH_NOSNAP), + CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, + 2, + ofs, len, bio); +} + +/* + * Request sync osd read + */ +static int rbd_req_sync_read(struct rbd_device *dev, + struct ceph_snap_context *snapc, + u64 snapid, + const char *obj, + u64 ofs, u64 len, + char *buf) +{ + return rbd_req_sync_op(dev, NULL, + (snapid ? snapid : CEPH_NOSNAP), + CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, + NULL, + 1, obj, ofs, len, buf); +} + +/* + * Request sync osd read + */ +static int rbd_req_sync_rollback_obj(struct rbd_device *dev, + u64 snapid, + const char *obj) +{ + struct ceph_osd_req_op *ops; + int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0); + if (ret < 0) + return ret; + + ops[0].snap.snapid = snapid; + + ret = rbd_req_sync_op(dev, NULL, + CEPH_NOSNAP, + 0, + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, + ops, + 1, obj, 0, 0, NULL); + + rbd_destroy_ops(ops); + + if (ret < 0) + return ret; + + return ret; +} + +/* + * Request sync osd read + */ +static int rbd_req_sync_exec(struct rbd_device *dev, + const char *obj, + const char *cls, + const char *method, + const char *data, + int len) +{ + struct ceph_osd_req_op *ops; + int cls_len = strlen(cls); + int method_len = strlen(method); + int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL, + cls_len + method_len + len); + if (ret < 0) + return ret; + + ops[0].cls.class_name = cls; + ops[0].cls.class_len = (__u8)cls_len; + ops[0].cls.method_name = method; + ops[0].cls.method_len = (__u8)method_len; + ops[0].cls.argc = 0; + ops[0].cls.indata = data; + ops[0].cls.indata_len = len; + + ret = rbd_req_sync_op(dev, NULL, + CEPH_NOSNAP, + 0, + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, + ops, + 1, obj, 0, 0, NULL); + + rbd_destroy_ops(ops); + + dout("cls_exec returned %d\n", ret); + return ret; +} + +/* + * block device queue callback + */ +static void rbd_rq_fn(struct request_queue *q) +{ + struct rbd_device *rbd_dev = q->queuedata; + struct request *rq; + struct bio_pair *bp = NULL; + + rq = blk_fetch_request(q); + + while (1) { + struct bio *bio; + struct bio *rq_bio, *next_bio = NULL; + bool do_write; + int size, op_size = 0; + u64 ofs; + + /* peek at request from block layer */ + if (!rq) + break; + + dout("fetched request\n"); + + /* filter out block requests we don't understand */ + if ((rq->cmd_type != REQ_TYPE_FS)) { + __blk_end_request_all(rq, 0); + goto next; + } + + /* deduce our operation (read, write) */ + do_write = (rq_data_dir(rq) == WRITE); + + size = blk_rq_bytes(rq); + ofs = blk_rq_pos(rq) * 512ULL; + rq_bio = rq->bio; + if (do_write && rbd_dev->read_only) { + __blk_end_request_all(rq, -EROFS); + goto next; + } + + spin_unlock_irq(q->queue_lock); + + dout("%s 0x%x bytes at 0x%llx\n", + do_write ? "write" : "read", + size, blk_rq_pos(rq) * 512ULL); + + do { + /* a bio clone to be passed down to OSD req */ + dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt); + op_size = rbd_get_segment(&rbd_dev->header, + rbd_dev->header.block_name, + ofs, size, + NULL, NULL); + bio = bio_chain_clone(&rq_bio, &next_bio, &bp, + op_size, GFP_ATOMIC); + if (!bio) { + spin_lock_irq(q->queue_lock); + __blk_end_request_all(rq, -ENOMEM); + goto next; + } + + /* init OSD command: write or read */ + if (do_write) + rbd_req_write(rq, rbd_dev, + rbd_dev->header.snapc, + ofs, + op_size, bio); + else + rbd_req_read(rq, rbd_dev, + cur_snap_id(rbd_dev), + ofs, + op_size, bio); + + size -= op_size; + ofs += op_size; + + rq_bio = next_bio; + } while (size > 0); + + if (bp) + bio_pair_release(bp); + + spin_lock_irq(q->queue_lock); +next: + rq = blk_fetch_request(q); + } +} + +/* + * a queue callback. Makes sure that we don't create a bio that spans across + * multiple osd objects. One exception would be with a single page bios, + * which we handle later at bio_chain_clone + */ +static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, + struct bio_vec *bvec) +{ + struct rbd_device *rbd_dev = q->queuedata; + unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9); + sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev); + unsigned int bio_sectors = bmd->bi_size >> 9; + int max; + + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + + bio_sectors)) << 9; + if (max < 0) + max = 0; /* bio_add cannot handle a negative return */ + if (max <= bvec->bv_len && bio_sectors == 0) + return bvec->bv_len; + return max; +} + +static void rbd_free_disk(struct rbd_device *rbd_dev) +{ + struct gendisk *disk = rbd_dev->disk; + + if (!disk) + return; + + rbd_header_free(&rbd_dev->header); + + if (disk->flags & GENHD_FL_UP) + del_gendisk(disk); + if (disk->queue) + blk_cleanup_queue(disk->queue); + put_disk(disk); +} + +/* + * reload the ondisk the header + */ +static int rbd_read_header(struct rbd_device *rbd_dev, + struct rbd_image_header *header) +{ + ssize_t rc; + struct rbd_image_header_ondisk *dh; + int snap_count = 0; + u64 snap_names_len = 0; + + while (1) { + int len = sizeof(*dh) + + snap_count * sizeof(struct rbd_image_snap_ondisk) + + snap_names_len; + + rc = -ENOMEM; + dh = kmalloc(len, GFP_KERNEL); + if (!dh) + return -ENOMEM; + + rc = rbd_req_sync_read(rbd_dev, + NULL, CEPH_NOSNAP, + rbd_dev->obj_md_name, + 0, len, + (char *)dh); + if (rc < 0) + goto out_dh; + + rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL); + if (rc < 0) + goto out_dh; + + if (snap_count != header->total_snaps) { + snap_count = header->total_snaps; + snap_names_len = header->snap_names_len; + rbd_header_free(header); + kfree(dh); + continue; + } + break; + } + +out_dh: + kfree(dh); + return rc; +} + +/* + * create a snapshot + */ +static int rbd_header_add_snap(struct rbd_device *dev, + const char *snap_name, + gfp_t gfp_flags) +{ + int name_len = strlen(snap_name); + u64 new_snapid; + int ret; + void *data, *data_start, *data_end; + + /* we should create a snapshot only if we're pointing at the head */ + if (dev->cur_snap) + return -EINVAL; + + ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid, + &new_snapid); + dout("created snapid=%lld\n", new_snapid); + if (ret < 0) + return ret; + + data = kmalloc(name_len + 16, gfp_flags); + if (!data) + return -ENOMEM; + + data_start = data; + data_end = data + name_len + 16; + + ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad); + ceph_encode_64_safe(&data, data_end, new_snapid, bad); + + ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add", + data_start, data - data_start); + + kfree(data_start); + + if (ret < 0) + return ret; + + dev->header.snapc->seq = new_snapid; + + return 0; +bad: + return -ERANGE; +} + +/* + * only read the first part of the ondisk header, without the snaps info + */ +static int rbd_update_snaps(struct rbd_device *rbd_dev) +{ + int ret; + struct rbd_image_header h; + u64 snap_seq; + + ret = rbd_read_header(rbd_dev, &h); + if (ret < 0) + return ret; + + down_write(&rbd_dev->header.snap_rwsem); + + snap_seq = rbd_dev->header.snapc->seq; + + kfree(rbd_dev->header.snapc); + kfree(rbd_dev->header.snap_names); + kfree(rbd_dev->header.snap_sizes); + + rbd_dev->header.total_snaps = h.total_snaps; + rbd_dev->header.snapc = h.snapc; + rbd_dev->header.snap_names = h.snap_names; + rbd_dev->header.snap_sizes = h.snap_sizes; + rbd_dev->header.snapc->seq = snap_seq; + + up_write(&rbd_dev->header.snap_rwsem); + + return 0; +} + +static int rbd_init_disk(struct rbd_device *rbd_dev) +{ + struct gendisk *disk; + struct request_queue *q; + int rc; + u64 total_size = 0; + + /* contact OSD, request size info about the object being mapped */ + rc = rbd_read_header(rbd_dev, &rbd_dev->header); + if (rc) + return rc; + + rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size); + if (rc) + return rc; + + /* create gendisk info */ + rc = -ENOMEM; + disk = alloc_disk(RBD_MINORS_PER_MAJOR); + if (!disk) + goto out; + + sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id); + disk->major = rbd_dev->major; + disk->first_minor = 0; + disk->fops = &rbd_bd_ops; + disk->private_data = rbd_dev; + + /* init rq */ + rc = -ENOMEM; + q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); + if (!q) + goto out_disk; + blk_queue_merge_bvec(q, rbd_merge_bvec); + disk->queue = q; + + q->queuedata = rbd_dev; + + rbd_dev->disk = disk; + rbd_dev->q = q; + + /* finally, announce the disk to the world */ + set_capacity(disk, total_size / 512ULL); + add_disk(disk); + + pr_info("%s: added with size 0x%llx\n", + disk->disk_name, (unsigned long long)total_size); + return 0; + +out_disk: + put_disk(disk); +out: + return rc; +} + +/******************************************************************** + * /sys/class/rbd/ + * add map rados objects to blkdev + * remove unmap rados objects + * list show mappings + *******************************************************************/ + +static void class_rbd_release(struct class *cls) +{ + kfree(cls); +} + +static ssize_t class_rbd_list(struct class *c, + struct class_attribute *attr, + char *data) +{ + int n = 0; + struct list_head *tmp; + int max = PAGE_SIZE; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + n += snprintf(data, max, + "#id\tmajor\tclient_name\tpool\tname\tsnap\tKB\n"); + + list_for_each(tmp, &rbd_dev_list) { + struct rbd_device *rbd_dev; + + rbd_dev = list_entry(tmp, struct rbd_device, node); + n += snprintf(data+n, max-n, + "%d\t%d\tclient%lld\t%s\t%s\t%s\t%lld\n", + rbd_dev->id, + rbd_dev->major, + ceph_client_id(rbd_dev->client), + rbd_dev->pool_name, + rbd_dev->obj, rbd_dev->snap_name, + rbd_dev->header.image_size >> 10); + if (n == max) + break; + } + + mutex_unlock(&ctl_mutex); + return n; +} + +static ssize_t class_rbd_add(struct class *c, + struct class_attribute *attr, + const char *buf, size_t count) +{ + struct ceph_osd_client *osdc; + struct rbd_device *rbd_dev; + ssize_t rc = -ENOMEM; + int irc, new_id = 0; + struct list_head *tmp; + char *mon_dev_name; + char *options; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); + if (!mon_dev_name) + goto err_out_mod; + + options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL); + if (!options) + goto err_mon_dev; + + /* new rbd_device object */ + rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); + if (!rbd_dev) + goto err_out_opt; + + /* static rbd_device initialization */ + spin_lock_init(&rbd_dev->lock); + INIT_LIST_HEAD(&rbd_dev->node); + + /* generate unique id: find highest unique id, add one */ + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + list_for_each(tmp, &rbd_dev_list) { + struct rbd_device *rbd_dev; + + rbd_dev = list_entry(tmp, struct rbd_device, node); + if (rbd_dev->id >= new_id) + new_id = rbd_dev->id + 1; + } + + rbd_dev->id = new_id; + + /* add to global list */ + list_add_tail(&rbd_dev->node, &rbd_dev_list); + + /* parse add command */ + if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s " + "%" __stringify(RBD_MAX_OPT_LEN) "s " + "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s " + "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s" + "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s", + mon_dev_name, options, rbd_dev->pool_name, + rbd_dev->obj, rbd_dev->snap_name) < 4) { + rc = -EINVAL; + goto err_out_slot; + } + + if (rbd_dev->snap_name[0] == 0) + rbd_dev->snap_name[0] = '-'; + + rbd_dev->obj_len = strlen(rbd_dev->obj); + snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s", + rbd_dev->obj, RBD_SUFFIX); + + /* initialize rest of new object */ + snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id); + rc = rbd_get_client(rbd_dev, mon_dev_name, options); + if (rc < 0) + goto err_out_slot; + + mutex_unlock(&ctl_mutex); + + /* pick the pool */ + osdc = &rbd_dev->client->osdc; + rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); + if (rc < 0) + goto err_out_client; + rbd_dev->poolid = rc; + + /* register our block device */ + irc = register_blkdev(0, rbd_dev->name); + if (irc < 0) { + rc = irc; + goto err_out_client; + } + rbd_dev->major = irc; + + /* set up and announce blkdev mapping */ + rc = rbd_init_disk(rbd_dev); + if (rc) + goto err_out_blkdev; + + return count; + +err_out_blkdev: + unregister_blkdev(rbd_dev->major, rbd_dev->name); +err_out_client: + rbd_put_client(rbd_dev); + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); +err_out_slot: + list_del_init(&rbd_dev->node); + mutex_unlock(&ctl_mutex); + + kfree(rbd_dev); +err_out_opt: + kfree(options); +err_mon_dev: + kfree(mon_dev_name); +err_out_mod: + dout("Error adding device %s\n", buf); + module_put(THIS_MODULE); + return rc; +} + +static struct rbd_device *__rbd_get_dev(unsigned long id) +{ + struct list_head *tmp; + struct rbd_device *rbd_dev; + + list_for_each(tmp, &rbd_dev_list) { + rbd_dev = list_entry(tmp, struct rbd_device, node); + if (rbd_dev->id == id) + return rbd_dev; + } + return NULL; +} + +static ssize_t class_rbd_remove(struct class *c, + struct class_attribute *attr, + const char *buf, + size_t count) +{ + struct rbd_device *rbd_dev = NULL; + int target_id, rc; + unsigned long ul; + + rc = strict_strtoul(buf, 10, &ul); + if (rc) + return rc; + + /* convert to int; abort if we lost anything in the conversion */ + target_id = (int) ul; + if (target_id != ul) + return -EINVAL; + + /* remove object from list immediately */ + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + rbd_dev = __rbd_get_dev(target_id); + if (rbd_dev) + list_del_init(&rbd_dev->node); + + mutex_unlock(&ctl_mutex); + + if (!rbd_dev) + return -ENOENT; + + rbd_put_client(rbd_dev); + + /* clean up and free blkdev */ + rbd_free_disk(rbd_dev); + unregister_blkdev(rbd_dev->major, rbd_dev->name); + kfree(rbd_dev); + + /* release module ref */ + module_put(THIS_MODULE); + + return count; +} + +static ssize_t class_rbd_snaps_list(struct class *c, + struct class_attribute *attr, + char *data) +{ + struct rbd_device *rbd_dev = NULL; + struct list_head *tmp; + struct rbd_image_header *header; + int i, n = 0, max = PAGE_SIZE; + int ret; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + n += snprintf(data, max, "#id\tsnap\tKB\n"); + + list_for_each(tmp, &rbd_dev_list) { + char *names, *p; + struct ceph_snap_context *snapc; + + rbd_dev = list_entry(tmp, struct rbd_device, node); + header = &rbd_dev->header; + + down_read(&header->snap_rwsem); + + names = header->snap_names; + snapc = header->snapc; + + n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n", + rbd_dev->id, RBD_SNAP_HEAD_NAME, + header->image_size >> 10, + (!rbd_dev->cur_snap ? " (*)" : "")); + if (n == max) + break; + + p = names; + for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) { + n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n", + rbd_dev->id, p, header->snap_sizes[i] >> 10, + (rbd_dev->cur_snap && + (snap_index(header, i) == rbd_dev->cur_snap) ? + " (*)" : "")); + if (n == max) + break; + } + + up_read(&header->snap_rwsem); + } + + + ret = n; + mutex_unlock(&ctl_mutex); + return ret; +} + +static ssize_t class_rbd_snaps_refresh(struct class *c, + struct class_attribute *attr, + const char *buf, + size_t count) +{ + struct rbd_device *rbd_dev = NULL; + int target_id, rc; + unsigned long ul; + int ret = count; + + rc = strict_strtoul(buf, 10, &ul); + if (rc) + return rc; + + /* convert to int; abort if we lost anything in the conversion */ + target_id = (int) ul; + if (target_id != ul) + return -EINVAL; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + rbd_dev = __rbd_get_dev(target_id); + if (!rbd_dev) { + ret = -ENOENT; + goto done; + } + + rc = rbd_update_snaps(rbd_dev); + if (rc < 0) + ret = rc; + +done: + mutex_unlock(&ctl_mutex); + return ret; +} + +static ssize_t class_rbd_snap_create(struct class *c, + struct class_attribute *attr, + const char *buf, + size_t count) +{ + struct rbd_device *rbd_dev = NULL; + int target_id, ret; + char *name; + + name = kmalloc(RBD_MAX_SNAP_NAME_LEN + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + + /* parse snaps add command */ + if (sscanf(buf, "%d " + "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s", + &target_id, + name) != 2) { + ret = -EINVAL; + goto done; + } + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + rbd_dev = __rbd_get_dev(target_id); + if (!rbd_dev) { + ret = -ENOENT; + goto done_unlock; + } + + ret = rbd_header_add_snap(rbd_dev, + name, GFP_KERNEL); + if (ret < 0) + goto done_unlock; + + ret = rbd_update_snaps(rbd_dev); + if (ret < 0) + goto done_unlock; + + ret = count; +done_unlock: + mutex_unlock(&ctl_mutex); +done: + kfree(name); + return ret; +} + +static ssize_t class_rbd_rollback(struct class *c, + struct class_attribute *attr, + const char *buf, + size_t count) +{ + struct rbd_device *rbd_dev = NULL; + int target_id, ret; + u64 snapid; + char snap_name[RBD_MAX_SNAP_NAME_LEN]; + u64 cur_ofs; + char *seg_name; + + /* parse snaps add command */ + if (sscanf(buf, "%d " + "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s", + &target_id, + snap_name) != 2) { + return -EINVAL; + } + + ret = -ENOMEM; + seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); + if (!seg_name) + return ret; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + rbd_dev = __rbd_get_dev(target_id); + if (!rbd_dev) { + ret = -ENOENT; + goto done_unlock; + } + + ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL); + if (ret < 0) + goto done_unlock; + + dout("snapid=%lld\n", snapid); + + cur_ofs = 0; + while (cur_ofs < rbd_dev->header.image_size) { + cur_ofs += rbd_get_segment(&rbd_dev->header, + rbd_dev->obj, + cur_ofs, (u64)-1, + seg_name, NULL); + dout("seg_name=%s\n", seg_name); + + ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name); + if (ret < 0) + pr_warning("could not roll back obj %s err=%d\n", + seg_name, ret); + } + + ret = rbd_update_snaps(rbd_dev); + if (ret < 0) + goto done_unlock; + + ret = count; + +done_unlock: + mutex_unlock(&ctl_mutex); + kfree(seg_name); + + return ret; +} + +static struct class_attribute class_rbd_attrs[] = { + __ATTR(add, 0200, NULL, class_rbd_add), + __ATTR(remove, 0200, NULL, class_rbd_remove), + __ATTR(list, 0444, class_rbd_list, NULL), + __ATTR(snaps_refresh, 0200, NULL, class_rbd_snaps_refresh), + __ATTR(snap_create, 0200, NULL, class_rbd_snap_create), + __ATTR(snaps_list, 0444, class_rbd_snaps_list, NULL), + __ATTR(snap_rollback, 0200, NULL, class_rbd_rollback), + __ATTR_NULL +}; + +/* + * create control files in sysfs + * /sys/class/rbd/... + */ +static int rbd_sysfs_init(void) +{ + int ret = -ENOMEM; + + class_rbd = kzalloc(sizeof(*class_rbd), GFP_KERNEL); + if (!class_rbd) + goto out; + + class_rbd->name = DRV_NAME; + class_rbd->owner = THIS_MODULE; + class_rbd->class_release = class_rbd_release; + class_rbd->class_attrs = class_rbd_attrs; + + ret = class_register(class_rbd); + if (ret) + goto out_class; + return 0; + +out_class: + kfree(class_rbd); + class_rbd = NULL; + pr_err(DRV_NAME ": failed to create class rbd\n"); +out: + return ret; +} + +static void rbd_sysfs_cleanup(void) +{ + if (class_rbd) + class_destroy(class_rbd); + class_rbd = NULL; +} + +int __init rbd_init(void) +{ + int rc; + + rc = rbd_sysfs_init(); + if (rc) + return rc; + spin_lock_init(&node_lock); + pr_info("loaded " DRV_NAME_LONG "\n"); + return 0; +} + +void __exit rbd_exit(void) +{ + rbd_sysfs_cleanup(); +} + +module_init(rbd_init); +module_exit(rbd_exit); + +MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); +MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); +MODULE_DESCRIPTION("rados block device"); + +/* following authorship retained from original osdblk.c */ +MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); + +MODULE_LICENSE("GPL"); diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h new file mode 100644 index 000000000000..fc6c678aa2cb --- /dev/null +++ b/drivers/block/rbd_types.h @@ -0,0 +1,73 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_RBD_TYPES_H +#define CEPH_RBD_TYPES_H + +#include <linux/types.h> + +/* + * rbd image 'foo' consists of objects + * foo.rbd - image metadata + * foo.00000000 + * foo.00000001 + * ... - data + */ + +#define RBD_SUFFIX ".rbd" +#define RBD_DIRECTORY "rbd_directory" +#define RBD_INFO "rbd_info" + +#define RBD_DEFAULT_OBJ_ORDER 22 /* 4MB */ +#define RBD_MIN_OBJ_ORDER 16 +#define RBD_MAX_OBJ_ORDER 30 + +#define RBD_MAX_OBJ_NAME_LEN 96 +#define RBD_MAX_SEG_NAME_LEN 128 + +#define RBD_COMP_NONE 0 +#define RBD_CRYPT_NONE 0 + +#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n" +#define RBD_HEADER_SIGNATURE "RBD" +#define RBD_HEADER_VERSION "001.005" + +struct rbd_info { + __le64 max_id; +} __attribute__ ((packed)); + +struct rbd_image_snap_ondisk { + __le64 id; + __le64 image_size; +} __attribute__((packed)); + +struct rbd_image_header_ondisk { + char text[40]; + char block_name[24]; + char signature[4]; + char version[8]; + struct { + __u8 order; + __u8 crypt_type; + __u8 comp_type; + __u8 unused; + } __attribute__((packed)) options; + __le64 image_size; + __le64 snap_seq; + __le32 snap_count; + __le32 reserved; + __le64 snap_names_len; + struct rbd_image_snap_ondisk snaps[0]; +} __attribute__((packed)); + + +#endif diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 2aafafca2b13..8320490226b7 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -2,7 +2,6 @@ #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/blkdev.h> -#include <linux/smp_lock.h> #include <linux/hdreg.h> #include <linux/virtio.h> #include <linux/virtio_blk.h> @@ -202,6 +201,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) struct virtio_blk *vblk = disk->private_data; struct request *req; struct bio *bio; + int err; bio = bio_map_kern(vblk->disk->queue, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL); @@ -215,11 +215,14 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) } req->cmd_type = REQ_TYPE_SPECIAL; - return blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); + err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); + blk_put_request(req); + + return err; } -static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long data) +static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long data) { struct gendisk *disk = bdev->bd_disk; struct virtio_blk *vblk = disk->private_data; @@ -234,18 +237,6 @@ static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode, (void __user *)data); } -static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long param) -{ - int ret; - - lock_kernel(); - ret = virtblk_locked_ioctl(bdev, mode, cmd, param); - unlock_kernel(); - - return ret; -} - /* We provide getgeo only to please some old bootloader/partitioning tools */ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) { diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 70312da4c968..564808a5c3c0 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -199,7 +199,7 @@ static void amd64_cleanup(void) struct pci_dev *dev = k8_northbridges[i]; /* disable gart translation */ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp); - tmp &= ~AMD64_GARTEN; + tmp &= ~GARTEN; pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, tmp); } } @@ -313,7 +313,7 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp, if (order < 0 || !agp_aperture_valid(aper, (32*1024*1024)<<order)) return -1; - pci_write_config_dword(nb, AMD64_GARTAPERTURECTL, order << 1); + gart_set_size_and_enable(nb, order); pci_write_config_dword(nb, AMD64_GARTAPERTUREBASE, aper >> 25); return 0; diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index d2abf5143983..64255cef8a7d 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -984,7 +984,9 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge) bridge->driver->cache_flush(); #ifdef CONFIG_X86 - set_memory_uc((unsigned long)table, 1 << page_order); + if (set_memory_uc((unsigned long)table, 1 << page_order)) + printk(KERN_WARNING "Could not set GATT table memory to UC!"); + bridge->gatt_table = (void *)table; #else bridge->gatt_table = ioremap_nocache(virt_to_phys(table), diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c index 05ad4a17a28f..7c4133582dba 100644 --- a/drivers/char/tpm/tpm.c +++ b/drivers/char/tpm/tpm.c @@ -47,6 +47,16 @@ enum tpm_duration { #define TPM_MAX_PROTECTED_ORDINAL 12 #define TPM_PROTECTED_ORDINAL_MASK 0xFF +/* + * Bug workaround - some TPM's don't flush the most + * recently changed pcr on suspend, so force the flush + * with an extend to the selected _unused_ non-volatile pcr. + */ +static int tpm_suspend_pcr; +module_param_named(suspend_pcr, tpm_suspend_pcr, uint, 0644); +MODULE_PARM_DESC(suspend_pcr, + "PCR to use for dummy writes to faciltate flush on suspend."); + static LIST_HEAD(tpm_chip_list); static DEFINE_SPINLOCK(driver_lock); static DECLARE_BITMAP(dev_mask, TPM_NUM_DEVICES); @@ -1077,18 +1087,6 @@ static struct tpm_input_header savestate_header = { .ordinal = TPM_ORD_SAVESTATE }; -/* Bug workaround - some TPM's don't flush the most - * recently changed pcr on suspend, so force the flush - * with an extend to the selected _unused_ non-volatile pcr. - */ -static int tpm_suspend_pcr; -static int __init tpm_suspend_setup(char *str) -{ - get_option(&str, &tpm_suspend_pcr); - return 1; -} -__setup("tpm_suspend_pcr=", tpm_suspend_setup); - /* * We are about to suspend. Save the TPM state * so that it can be restored. diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index c810481a5bc2..6c1b676643a9 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -48,6 +48,9 @@ struct ports_driver_data { /* Used for exporting per-port information to debugfs */ struct dentry *debugfs_dir; + /* List of all the devices we're handling */ + struct list_head portdevs; + /* Number of devices this driver is handling */ unsigned int index; @@ -108,6 +111,9 @@ struct port_buffer { * ports for that device (vdev->priv). */ struct ports_device { + /* Next portdev in the list, head is in the pdrvdata struct */ + struct list_head list; + /* * Workqueue handlers where we process deferred work after * notification @@ -178,15 +184,21 @@ struct port { struct console cons; /* Each port associates with a separate char device */ - struct cdev cdev; + struct cdev *cdev; struct device *dev; + /* Reference-counting to handle port hot-unplugs and file operations */ + struct kref kref; + /* A waitqueue for poll() or blocking read operations */ wait_queue_head_t waitqueue; /* The 'name' of the port that we expose via sysfs properties */ char *name; + /* We can notify apps of host connect / disconnect events via SIGIO */ + struct fasync_struct *async_queue; + /* The 'id' to identify the port with the Host */ u32 id; @@ -221,6 +233,41 @@ out: return port; } +static struct port *find_port_by_devt_in_portdev(struct ports_device *portdev, + dev_t dev) +{ + struct port *port; + unsigned long flags; + + spin_lock_irqsave(&portdev->ports_lock, flags); + list_for_each_entry(port, &portdev->ports, list) + if (port->cdev->dev == dev) + goto out; + port = NULL; +out: + spin_unlock_irqrestore(&portdev->ports_lock, flags); + + return port; +} + +static struct port *find_port_by_devt(dev_t dev) +{ + struct ports_device *portdev; + struct port *port; + unsigned long flags; + + spin_lock_irqsave(&pdrvdata_lock, flags); + list_for_each_entry(portdev, &pdrvdata.portdevs, list) { + port = find_port_by_devt_in_portdev(portdev, dev); + if (port) + goto out; + } + port = NULL; +out: + spin_unlock_irqrestore(&pdrvdata_lock, flags); + return port; +} + static struct port *find_port_by_id(struct ports_device *portdev, u32 id) { struct port *port; @@ -410,7 +457,10 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id, static ssize_t send_control_msg(struct port *port, unsigned int event, unsigned int value) { - return __send_control_msg(port->portdev, port->id, event, value); + /* Did the port get unplugged before userspace closed it? */ + if (port->portdev) + return __send_control_msg(port->portdev, port->id, event, value); + return 0; } /* Callers must take the port->outvq_lock */ @@ -459,9 +509,12 @@ static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count, /* * Wait till the host acknowledges it pushed out the data we - * sent. This is done for ports in blocking mode or for data - * from the hvc_console; the tty operations are performed with - * spinlocks held so we can't sleep here. + * sent. This is done for data from the hvc_console; the tty + * operations are performed with spinlocks held so we can't + * sleep here. An alternative would be to copy the data to a + * buffer and relax the spinning requirement. The downside is + * we need to kmalloc a GFP_ATOMIC buffer each time the + * console driver writes something out. */ while (!virtqueue_get_buf(out_vq, &len)) cpu_relax(); @@ -522,6 +575,10 @@ static ssize_t fill_readbuf(struct port *port, char *out_buf, size_t out_count, /* The condition that must be true for polling to end */ static bool will_read_block(struct port *port) { + if (!port->guest_connected) { + /* Port got hot-unplugged. Let's exit. */ + return false; + } return !port_has_data(port) && port->host_connected; } @@ -572,6 +629,9 @@ static ssize_t port_fops_read(struct file *filp, char __user *ubuf, if (ret < 0) return ret; } + /* Port got hot-unplugged. */ + if (!port->guest_connected) + return -ENODEV; /* * We could've received a disconnection message while we were * waiting for more data. @@ -613,6 +673,9 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; } + /* Port got hot-unplugged. */ + if (!port->guest_connected) + return -ENODEV; count = min((size_t)(32 * 1024), count); @@ -626,6 +689,14 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf, goto free_buf; } + /* + * We now ask send_buf() to not spin for generic ports -- we + * can re-use the same code path that non-blocking file + * descriptors take for blocking file descriptors since the + * wait is already done and we're certain the write will go + * through to the host. + */ + nonblock = true; ret = send_buf(port, buf, count, nonblock); if (nonblock && ret > 0) @@ -645,6 +716,10 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait) port = filp->private_data; poll_wait(filp, &port->waitqueue, wait); + if (!port->guest_connected) { + /* Port got unplugged */ + return POLLHUP; + } ret = 0; if (!will_read_block(port)) ret |= POLLIN | POLLRDNORM; @@ -656,6 +731,8 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait) return ret; } +static void remove_port(struct kref *kref); + static int port_fops_release(struct inode *inode, struct file *filp) { struct port *port; @@ -676,6 +753,16 @@ static int port_fops_release(struct inode *inode, struct file *filp) reclaim_consumed_buffers(port); spin_unlock_irq(&port->outvq_lock); + /* + * Locks aren't necessary here as a port can't be opened after + * unplug, and if a port isn't unplugged, a kref would already + * exist for the port. Plus, taking ports_lock here would + * create a dependency on other locks taken by functions + * inside remove_port if we're the last holder of the port, + * creating many problems. + */ + kref_put(&port->kref, remove_port); + return 0; } @@ -683,22 +770,31 @@ static int port_fops_open(struct inode *inode, struct file *filp) { struct cdev *cdev = inode->i_cdev; struct port *port; + int ret; - port = container_of(cdev, struct port, cdev); + port = find_port_by_devt(cdev->dev); filp->private_data = port; + /* Prevent against a port getting hot-unplugged at the same time */ + spin_lock_irq(&port->portdev->ports_lock); + kref_get(&port->kref); + spin_unlock_irq(&port->portdev->ports_lock); + /* * Don't allow opening of console port devices -- that's done * via /dev/hvc */ - if (is_console_port(port)) - return -ENXIO; + if (is_console_port(port)) { + ret = -ENXIO; + goto out; + } /* Allow only one process to open a particular port at a time */ spin_lock_irq(&port->inbuf_lock); if (port->guest_connected) { spin_unlock_irq(&port->inbuf_lock); - return -EMFILE; + ret = -EMFILE; + goto out; } port->guest_connected = true; @@ -713,10 +809,23 @@ static int port_fops_open(struct inode *inode, struct file *filp) reclaim_consumed_buffers(port); spin_unlock_irq(&port->outvq_lock); + nonseekable_open(inode, filp); + /* Notify host of port being opened */ send_control_msg(filp->private_data, VIRTIO_CONSOLE_PORT_OPEN, 1); return 0; +out: + kref_put(&port->kref, remove_port); + return ret; +} + +static int port_fops_fasync(int fd, struct file *filp, int mode) +{ + struct port *port; + + port = filp->private_data; + return fasync_helper(fd, filp, mode, &port->async_queue); } /* @@ -732,6 +841,8 @@ static const struct file_operations port_fops = { .write = port_fops_write, .poll = port_fops_poll, .release = port_fops_release, + .fasync = port_fops_fasync, + .llseek = no_llseek, }; /* @@ -990,6 +1101,12 @@ static unsigned int fill_queue(struct virtqueue *vq, spinlock_t *lock) return nr_added_bufs; } +static void send_sigio_to_port(struct port *port) +{ + if (port->async_queue && port->guest_connected) + kill_fasync(&port->async_queue, SIGIO, POLL_OUT); +} + static int add_port(struct ports_device *portdev, u32 id) { char debugfs_name[16]; @@ -1004,6 +1121,7 @@ static int add_port(struct ports_device *portdev, u32 id) err = -ENOMEM; goto fail; } + kref_init(&port->kref); port->portdev = portdev; port->id = id; @@ -1011,6 +1129,7 @@ static int add_port(struct ports_device *portdev, u32 id) port->name = NULL; port->inbuf = NULL; port->cons.hvc = NULL; + port->async_queue = NULL; port->cons.ws.ws_row = port->cons.ws.ws_col = 0; @@ -1021,14 +1140,20 @@ static int add_port(struct ports_device *portdev, u32 id) port->in_vq = portdev->in_vqs[port->id]; port->out_vq = portdev->out_vqs[port->id]; - cdev_init(&port->cdev, &port_fops); + port->cdev = cdev_alloc(); + if (!port->cdev) { + dev_err(&port->portdev->vdev->dev, "Error allocating cdev\n"); + err = -ENOMEM; + goto free_port; + } + port->cdev->ops = &port_fops; devt = MKDEV(portdev->chr_major, id); - err = cdev_add(&port->cdev, devt, 1); + err = cdev_add(port->cdev, devt, 1); if (err < 0) { dev_err(&port->portdev->vdev->dev, "Error %d adding cdev for port %u\n", err, id); - goto free_port; + goto free_cdev; } port->dev = device_create(pdrvdata.class, &port->portdev->vdev->dev, devt, port, "vport%up%u", @@ -1093,7 +1218,7 @@ free_inbufs: free_device: device_destroy(pdrvdata.class, port->dev->devt); free_cdev: - cdev_del(&port->cdev); + cdev_del(port->cdev); free_port: kfree(port); fail: @@ -1102,21 +1227,45 @@ fail: return err; } -/* Remove all port-specific data. */ -static int remove_port(struct port *port) +/* No users remain, remove all port-specific data. */ +static void remove_port(struct kref *kref) +{ + struct port *port; + + port = container_of(kref, struct port, kref); + + sysfs_remove_group(&port->dev->kobj, &port_attribute_group); + device_destroy(pdrvdata.class, port->dev->devt); + cdev_del(port->cdev); + + kfree(port->name); + + debugfs_remove(port->debugfs_file); + + kfree(port); +} + +/* + * Port got unplugged. Remove port from portdev's list and drop the + * kref reference. If no userspace has this port opened, it will + * result in immediate removal the port. + */ +static void unplug_port(struct port *port) { struct port_buffer *buf; + spin_lock_irq(&port->portdev->ports_lock); + list_del(&port->list); + spin_unlock_irq(&port->portdev->ports_lock); + if (port->guest_connected) { port->guest_connected = false; port->host_connected = false; wake_up_interruptible(&port->waitqueue); - send_control_msg(port, VIRTIO_CONSOLE_PORT_OPEN, 0); - } - spin_lock_irq(&port->portdev->ports_lock); - list_del(&port->list); - spin_unlock_irq(&port->portdev->ports_lock); + /* Let the app know the port is going down. */ + send_sigio_to_port(port); + } if (is_console_port(port)) { spin_lock_irq(&pdrvdata_lock); @@ -1135,9 +1284,6 @@ static int remove_port(struct port *port) hvc_remove(port->cons.hvc); #endif } - sysfs_remove_group(&port->dev->kobj, &port_attribute_group); - device_destroy(pdrvdata.class, port->dev->devt); - cdev_del(&port->cdev); /* Remove unused data this port might have received. */ discard_port_data(port); @@ -1148,12 +1294,19 @@ static int remove_port(struct port *port) while ((buf = virtqueue_detach_unused_buf(port->in_vq))) free_buf(buf); - kfree(port->name); - - debugfs_remove(port->debugfs_file); + /* + * We should just assume the device itself has gone off -- + * else a close on an open port later will try to send out a + * control message. + */ + port->portdev = NULL; - kfree(port); - return 0; + /* + * Locks around here are not necessary - a port can't be + * opened after we removed the port struct from ports_list + * above. + */ + kref_put(&port->kref, remove_port); } /* Any private messages that the Host and Guest want to share */ @@ -1192,7 +1345,7 @@ static void handle_control_message(struct ports_device *portdev, add_port(portdev, cpkt->id); break; case VIRTIO_CONSOLE_PORT_REMOVE: - remove_port(port); + unplug_port(port); break; case VIRTIO_CONSOLE_CONSOLE_PORT: if (!cpkt->value) @@ -1234,6 +1387,12 @@ static void handle_control_message(struct ports_device *portdev, spin_lock_irq(&port->outvq_lock); reclaim_consumed_buffers(port); spin_unlock_irq(&port->outvq_lock); + + /* + * If the guest is connected, it'll be interested in + * knowing the host connection state changed. + */ + send_sigio_to_port(port); break; case VIRTIO_CONSOLE_PORT_NAME: /* @@ -1330,6 +1489,9 @@ static void in_intr(struct virtqueue *vq) wake_up_interruptible(&port->waitqueue); + /* Send a SIGIO indicating new data in case the process asked for it */ + send_sigio_to_port(port); + if (is_console_port(port) && hvc_poll(port->cons.hvc)) hvc_kick(); } @@ -1566,6 +1728,10 @@ static int __devinit virtcons_probe(struct virtio_device *vdev) add_port(portdev, 0); } + spin_lock_irq(&pdrvdata_lock); + list_add_tail(&portdev->list, &pdrvdata.portdevs); + spin_unlock_irq(&pdrvdata_lock); + __send_control_msg(portdev, VIRTIO_CONSOLE_BAD_ID, VIRTIO_CONSOLE_DEVICE_READY, 1); return 0; @@ -1589,23 +1755,41 @@ static void virtcons_remove(struct virtio_device *vdev) { struct ports_device *portdev; struct port *port, *port2; - struct port_buffer *buf; - unsigned int len; portdev = vdev->priv; + spin_lock_irq(&pdrvdata_lock); + list_del(&portdev->list); + spin_unlock_irq(&pdrvdata_lock); + + /* Disable interrupts for vqs */ + vdev->config->reset(vdev); + /* Finish up work that's lined up */ cancel_work_sync(&portdev->control_work); list_for_each_entry_safe(port, port2, &portdev->ports, list) - remove_port(port); + unplug_port(port); unregister_chrdev(portdev->chr_major, "virtio-portsdev"); - while ((buf = virtqueue_get_buf(portdev->c_ivq, &len))) - free_buf(buf); + /* + * When yanking out a device, we immediately lose the + * (device-side) queues. So there's no point in keeping the + * guest side around till we drop our final reference. This + * also means that any ports which are in an open state will + * have to just stop using the port, as the vqs are going + * away. + */ + if (use_multiport(portdev)) { + struct port_buffer *buf; + unsigned int len; - while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq))) - free_buf(buf); + while ((buf = virtqueue_get_buf(portdev->c_ivq, &len))) + free_buf(buf); + + while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq))) + free_buf(buf); + } vdev->config->del_vqs(vdev); kfree(portdev->in_vqs); @@ -1652,6 +1836,7 @@ static int __init init(void) PTR_ERR(pdrvdata.debugfs_dir)); } INIT_LIST_HEAD(&pdrvdata.consoles); + INIT_LIST_HEAD(&pdrvdata.portdevs); return register_virtio_driver(&virtio_console); } diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c index 216f9d383b5b..effd140fc042 100644 --- a/drivers/dma/ioat/dma_v2.c +++ b/drivers/dma/ioat/dma_v2.c @@ -879,7 +879,7 @@ int __devinit ioat2_dma_probe(struct ioatdma_device *device, int dca) dma->device_issue_pending = ioat2_issue_pending; dma->device_alloc_chan_resources = ioat2_alloc_chan_resources; dma->device_free_chan_resources = ioat2_free_chan_resources; - dma->device_tx_status = ioat_tx_status; + dma->device_tx_status = ioat_dma_tx_status; err = ioat_probe(device); if (err) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 1b05896648bc..9dcb17d51aee 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -2840,7 +2840,7 @@ static int __devinit pci_probe(struct pci_dev *dev, const struct pci_device_id *ent) { struct fw_ohci *ohci; - u32 bus_options, max_receive, link_speed, version, link_enh; + u32 bus_options, max_receive, link_speed, version; u64 guid; int i, err, n_ir, n_it; size_t size; @@ -2894,23 +2894,6 @@ static int __devinit pci_probe(struct pci_dev *dev, if (param_quirks) ohci->quirks = param_quirks; - /* TI OHCI-Lynx and compatible: set recommended configuration bits. */ - if (dev->vendor == PCI_VENDOR_ID_TI) { - pci_read_config_dword(dev, PCI_CFG_TI_LinkEnh, &link_enh); - - /* adjust latency of ATx FIFO: use 1.7 KB threshold */ - link_enh &= ~TI_LinkEnh_atx_thresh_mask; - link_enh |= TI_LinkEnh_atx_thresh_1_7K; - - /* use priority arbitration for asynchronous responses */ - link_enh |= TI_LinkEnh_enab_unfair; - - /* required for aPhyEnhanceEnable to work */ - link_enh |= TI_LinkEnh_enab_accel; - - pci_write_config_dword(dev, PCI_CFG_TI_LinkEnh, link_enh); - } - ar_context_init(&ohci->ar_request_ctx, ohci, OHCI1394_AsReqRcvContextControlSet); diff --git a/drivers/firewire/ohci.h b/drivers/firewire/ohci.h index 0e6c5a466908..ef5e7336da68 100644 --- a/drivers/firewire/ohci.h +++ b/drivers/firewire/ohci.h @@ -155,12 +155,4 @@ #define OHCI1394_phy_tcode 0xe -/* TI extensions */ - -#define PCI_CFG_TI_LinkEnh 0xf4 -#define TI_LinkEnh_enab_accel 0x00000002 -#define TI_LinkEnh_enab_unfair 0x00000080 -#define TI_LinkEnh_atx_thresh_mask 0x00003000 -#define TI_LinkEnh_atx_thresh_1_7K 0x00001000 - #endif /* _FIREWIRE_OHCI_H */ diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index c74e4e8006d4..2dd2c93ebfa3 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -2231,6 +2231,9 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags) dev_priv->mchdev_lock = &mchdev_lock; spin_unlock(&mchdev_lock); + /* XXX Prevent module unload due to memory corruption bugs. */ + __module_get(THIS_MODULE); + return 0; out_workqueue_free: diff --git a/drivers/gpu/drm/i915/intel_fb.c b/drivers/gpu/drm/i915/intel_fb.c index 56ad9df2ccb5..b61966c126d3 100644 --- a/drivers/gpu/drm/i915/intel_fb.c +++ b/drivers/gpu/drm/i915/intel_fb.c @@ -238,8 +238,8 @@ int intel_fbdev_destroy(struct drm_device *dev, drm_framebuffer_cleanup(&ifb->base); if (ifb->obj) { - drm_gem_object_handle_unreference(ifb->obj); drm_gem_object_unreference(ifb->obj); + ifb->obj = NULL; } return 0; diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.c b/drivers/gpu/drm/nouveau/nouveau_fbcon.c index d2047713dc59..dbd30b2e43fd 100644 --- a/drivers/gpu/drm/nouveau/nouveau_fbcon.c +++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.c @@ -352,7 +352,6 @@ nouveau_fbcon_destroy(struct drm_device *dev, struct nouveau_fbdev *nfbdev) if (nouveau_fb->nvbo) { nouveau_bo_unmap(nouveau_fb->nvbo); - drm_gem_object_handle_unreference_unlocked(nouveau_fb->nvbo->gem); drm_gem_object_unreference_unlocked(nouveau_fb->nvbo->gem); nouveau_fb->nvbo = NULL; } diff --git a/drivers/gpu/drm/nouveau/nouveau_notifier.c b/drivers/gpu/drm/nouveau/nouveau_notifier.c index 3c9964a8fbad..3ec181ff50ce 100644 --- a/drivers/gpu/drm/nouveau/nouveau_notifier.c +++ b/drivers/gpu/drm/nouveau/nouveau_notifier.c @@ -79,7 +79,6 @@ nouveau_notifier_takedown_channel(struct nouveau_channel *chan) mutex_lock(&dev->struct_mutex); nouveau_bo_unpin(chan->notifier_bo); mutex_unlock(&dev->struct_mutex); - drm_gem_object_handle_unreference_unlocked(chan->notifier_bo->gem); drm_gem_object_unreference_unlocked(chan->notifier_bo->gem); drm_mm_takedown(&chan->notifier_heap); } diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c index 79082d4398ae..2f93d46ae69a 100644 --- a/drivers/gpu/drm/radeon/evergreen.c +++ b/drivers/gpu/drm/radeon/evergreen.c @@ -1137,7 +1137,7 @@ static void evergreen_gpu_init(struct radeon_device *rdev) WREG32(RCU_IND_INDEX, 0x203); efuse_straps_3 = RREG32(RCU_IND_DATA); - efuse_box_bit_127_124 = (u8)(efuse_straps_3 & 0xF0000000) >> 28; + efuse_box_bit_127_124 = (u8)((efuse_straps_3 & 0xF0000000) >> 28); switch(efuse_box_bit_127_124) { case 0x0: @@ -1407,6 +1407,7 @@ int evergreen_mc_init(struct radeon_device *rdev) rdev->mc.mc_vram_size = RREG32(CONFIG_MEMSIZE) * 1024 * 1024; rdev->mc.real_vram_size = RREG32(CONFIG_MEMSIZE) * 1024 * 1024; rdev->mc.visible_vram_size = rdev->mc.aper_size; + rdev->mc.active_vram_size = rdev->mc.visible_vram_size; r600_vram_gtt_location(rdev, &rdev->mc); radeon_update_bandwidth_info(rdev); @@ -1520,7 +1521,7 @@ void evergreen_disable_interrupt_state(struct radeon_device *rdev) { u32 tmp; - WREG32(CP_INT_CNTL, 0); + WREG32(CP_INT_CNTL, CNTX_BUSY_INT_ENABLE | CNTX_EMPTY_INT_ENABLE); WREG32(GRBM_INT_CNTL, 0); WREG32(INT_MASK + EVERGREEN_CRTC0_REGISTER_OFFSET, 0); WREG32(INT_MASK + EVERGREEN_CRTC1_REGISTER_OFFSET, 0); diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c index e151f16a8f86..e59422320bb6 100644 --- a/drivers/gpu/drm/radeon/r100.c +++ b/drivers/gpu/drm/radeon/r100.c @@ -1030,6 +1030,7 @@ int r100_cp_init(struct radeon_device *rdev, unsigned ring_size) return r; } rdev->cp.ready = true; + rdev->mc.active_vram_size = rdev->mc.real_vram_size; return 0; } @@ -1047,6 +1048,7 @@ void r100_cp_fini(struct radeon_device *rdev) void r100_cp_disable(struct radeon_device *rdev) { /* Disable ring */ + rdev->mc.active_vram_size = rdev->mc.visible_vram_size; rdev->cp.ready = false; WREG32(RADEON_CP_CSQ_MODE, 0); WREG32(RADEON_CP_CSQ_CNTL, 0); @@ -2295,6 +2297,7 @@ void r100_vram_init_sizes(struct radeon_device *rdev) /* FIXME we don't use the second aperture yet when we could use it */ if (rdev->mc.visible_vram_size > rdev->mc.aper_size) rdev->mc.visible_vram_size = rdev->mc.aper_size; + rdev->mc.active_vram_size = rdev->mc.visible_vram_size; config_aper_size = RREG32(RADEON_CONFIG_APER_SIZE); if (rdev->flags & RADEON_IS_IGP) { uint32_t tom; diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c index 7a04959ba0ee..7b65e4efe8af 100644 --- a/drivers/gpu/drm/radeon/r600.c +++ b/drivers/gpu/drm/radeon/r600.c @@ -1248,6 +1248,7 @@ int r600_mc_init(struct radeon_device *rdev) rdev->mc.mc_vram_size = RREG32(CONFIG_MEMSIZE); rdev->mc.real_vram_size = RREG32(CONFIG_MEMSIZE); rdev->mc.visible_vram_size = rdev->mc.aper_size; + rdev->mc.active_vram_size = rdev->mc.visible_vram_size; r600_vram_gtt_location(rdev, &rdev->mc); if (rdev->flags & RADEON_IS_IGP) { @@ -1917,6 +1918,7 @@ void r600_pciep_wreg(struct radeon_device *rdev, u32 reg, u32 v) */ void r600_cp_stop(struct radeon_device *rdev) { + rdev->mc.active_vram_size = rdev->mc.visible_vram_size; WREG32(R_0086D8_CP_ME_CNTL, S_0086D8_CP_ME_HALT(1)); } @@ -2910,7 +2912,7 @@ static void r600_disable_interrupt_state(struct radeon_device *rdev) { u32 tmp; - WREG32(CP_INT_CNTL, 0); + WREG32(CP_INT_CNTL, CNTX_BUSY_INT_ENABLE | CNTX_EMPTY_INT_ENABLE); WREG32(GRBM_INT_CNTL, 0); WREG32(DxMODE_INT_MASK, 0); if (ASIC_IS_DCE3(rdev)) { diff --git a/drivers/gpu/drm/radeon/r600_blit_kms.c b/drivers/gpu/drm/radeon/r600_blit_kms.c index 9ceb2a1ce799..3473c00781ff 100644 --- a/drivers/gpu/drm/radeon/r600_blit_kms.c +++ b/drivers/gpu/drm/radeon/r600_blit_kms.c @@ -532,6 +532,7 @@ int r600_blit_init(struct radeon_device *rdev) memcpy(ptr + rdev->r600_blit.ps_offset, r6xx_ps, r6xx_ps_size * 4); radeon_bo_kunmap(rdev->r600_blit.shader_obj); radeon_bo_unreserve(rdev->r600_blit.shader_obj); + rdev->mc.active_vram_size = rdev->mc.real_vram_size; return 0; } @@ -539,6 +540,7 @@ void r600_blit_fini(struct radeon_device *rdev) { int r; + rdev->mc.active_vram_size = rdev->mc.visible_vram_size; if (rdev->r600_blit.shader_obj == NULL) return; /* If we can't reserve the bo, unref should be enough to destroy diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index a168d644bf9e..9ff38c99a6ea 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -344,6 +344,7 @@ struct radeon_mc { * about vram size near mc fb location */ u64 mc_vram_size; u64 visible_vram_size; + u64 active_vram_size; u64 gtt_size; u64 gtt_start; u64 gtt_end; diff --git a/drivers/gpu/drm/radeon/radeon_atombios.c b/drivers/gpu/drm/radeon/radeon_atombios.c index 68932ba7b8a4..8e43ddae70cc 100644 --- a/drivers/gpu/drm/radeon/radeon_atombios.c +++ b/drivers/gpu/drm/radeon/radeon_atombios.c @@ -1558,39 +1558,39 @@ radeon_atombios_get_tv_info(struct radeon_device *rdev) switch (tv_info->ucTV_BootUpDefaultStandard) { case ATOM_TV_NTSC: tv_std = TV_STD_NTSC; - DRM_INFO("Default TV standard: NTSC\n"); + DRM_DEBUG_KMS("Default TV standard: NTSC\n"); break; case ATOM_TV_NTSCJ: tv_std = TV_STD_NTSC_J; - DRM_INFO("Default TV standard: NTSC-J\n"); + DRM_DEBUG_KMS("Default TV standard: NTSC-J\n"); break; case ATOM_TV_PAL: tv_std = TV_STD_PAL; - DRM_INFO("Default TV standard: PAL\n"); + DRM_DEBUG_KMS("Default TV standard: PAL\n"); break; case ATOM_TV_PALM: tv_std = TV_STD_PAL_M; - DRM_INFO("Default TV standard: PAL-M\n"); + DRM_DEBUG_KMS("Default TV standard: PAL-M\n"); break; case ATOM_TV_PALN: tv_std = TV_STD_PAL_N; - DRM_INFO("Default TV standard: PAL-N\n"); + DRM_DEBUG_KMS("Default TV standard: PAL-N\n"); break; case ATOM_TV_PALCN: tv_std = TV_STD_PAL_CN; - DRM_INFO("Default TV standard: PAL-CN\n"); + DRM_DEBUG_KMS("Default TV standard: PAL-CN\n"); break; case ATOM_TV_PAL60: tv_std = TV_STD_PAL_60; - DRM_INFO("Default TV standard: PAL-60\n"); + DRM_DEBUG_KMS("Default TV standard: PAL-60\n"); break; case ATOM_TV_SECAM: tv_std = TV_STD_SECAM; - DRM_INFO("Default TV standard: SECAM\n"); + DRM_DEBUG_KMS("Default TV standard: SECAM\n"); break; default: tv_std = TV_STD_NTSC; - DRM_INFO("Unknown TV standard; defaulting to NTSC\n"); + DRM_DEBUG_KMS("Unknown TV standard; defaulting to NTSC\n"); break; } } diff --git a/drivers/gpu/drm/radeon/radeon_combios.c b/drivers/gpu/drm/radeon/radeon_combios.c index a04b7a6ad95f..7b7ea269549c 100644 --- a/drivers/gpu/drm/radeon/radeon_combios.c +++ b/drivers/gpu/drm/radeon/radeon_combios.c @@ -913,47 +913,47 @@ radeon_combios_get_tv_info(struct radeon_device *rdev) switch (RBIOS8(tv_info + 7) & 0xf) { case 1: tv_std = TV_STD_NTSC; - DRM_INFO("Default TV standard: NTSC\n"); + DRM_DEBUG_KMS("Default TV standard: NTSC\n"); break; case 2: tv_std = TV_STD_PAL; - DRM_INFO("Default TV standard: PAL\n"); + DRM_DEBUG_KMS("Default TV standard: PAL\n"); break; case 3: tv_std = TV_STD_PAL_M; - DRM_INFO("Default TV standard: PAL-M\n"); + DRM_DEBUG_KMS("Default TV standard: PAL-M\n"); break; case 4: tv_std = TV_STD_PAL_60; - DRM_INFO("Default TV standard: PAL-60\n"); + DRM_DEBUG_KMS("Default TV standard: PAL-60\n"); break; case 5: tv_std = TV_STD_NTSC_J; - DRM_INFO("Default TV standard: NTSC-J\n"); + DRM_DEBUG_KMS("Default TV standard: NTSC-J\n"); break; case 6: tv_std = TV_STD_SCART_PAL; - DRM_INFO("Default TV standard: SCART-PAL\n"); + DRM_DEBUG_KMS("Default TV standard: SCART-PAL\n"); break; default: tv_std = TV_STD_NTSC; - DRM_INFO + DRM_DEBUG_KMS ("Unknown TV standard; defaulting to NTSC\n"); break; } switch ((RBIOS8(tv_info + 9) >> 2) & 0x3) { case 0: - DRM_INFO("29.498928713 MHz TV ref clk\n"); + DRM_DEBUG_KMS("29.498928713 MHz TV ref clk\n"); break; case 1: - DRM_INFO("28.636360000 MHz TV ref clk\n"); + DRM_DEBUG_KMS("28.636360000 MHz TV ref clk\n"); break; case 2: - DRM_INFO("14.318180000 MHz TV ref clk\n"); + DRM_DEBUG_KMS("14.318180000 MHz TV ref clk\n"); break; case 3: - DRM_INFO("27.000000000 MHz TV ref clk\n"); + DRM_DEBUG_KMS("27.000000000 MHz TV ref clk\n"); break; default: break; @@ -1324,7 +1324,7 @@ bool radeon_legacy_get_tmds_info_from_combios(struct radeon_encoder *encoder, if (tmds_info) { ver = RBIOS8(tmds_info); - DRM_INFO("DFP table revision: %d\n", ver); + DRM_DEBUG_KMS("DFP table revision: %d\n", ver); if (ver == 3) { n = RBIOS8(tmds_info + 5) + 1; if (n > 4) @@ -1408,7 +1408,7 @@ bool radeon_legacy_get_ext_tmds_info_from_combios(struct radeon_encoder *encoder offset = combios_get_table_offset(dev, COMBIOS_EXT_TMDS_INFO_TABLE); if (offset) { ver = RBIOS8(offset); - DRM_INFO("External TMDS Table revision: %d\n", ver); + DRM_DEBUG_KMS("External TMDS Table revision: %d\n", ver); tmds->slave_addr = RBIOS8(offset + 4 + 2); tmds->slave_addr >>= 1; /* 7 bit addressing */ gpio = RBIOS8(offset + 4 + 3); diff --git a/drivers/gpu/drm/radeon/radeon_cursor.c b/drivers/gpu/drm/radeon/radeon_cursor.c index 5731fc9b1ae3..3eef567b0421 100644 --- a/drivers/gpu/drm/radeon/radeon_cursor.c +++ b/drivers/gpu/drm/radeon/radeon_cursor.c @@ -203,6 +203,7 @@ int radeon_crtc_cursor_move(struct drm_crtc *crtc, struct radeon_crtc *radeon_crtc = to_radeon_crtc(crtc); struct radeon_device *rdev = crtc->dev->dev_private; int xorigin = 0, yorigin = 0; + int w = radeon_crtc->cursor_width; if (x < 0) xorigin = -x + 1; @@ -213,22 +214,7 @@ int radeon_crtc_cursor_move(struct drm_crtc *crtc, if (yorigin >= CURSOR_HEIGHT) yorigin = CURSOR_HEIGHT - 1; - radeon_lock_cursor(crtc, true); - if (ASIC_IS_DCE4(rdev)) { - /* cursors are offset into the total surface */ - x += crtc->x; - y += crtc->y; - DRM_DEBUG("x %d y %d c->x %d c->y %d\n", x, y, crtc->x, crtc->y); - - /* XXX: check if evergreen has the same issues as avivo chips */ - WREG32(EVERGREEN_CUR_POSITION + radeon_crtc->crtc_offset, - ((xorigin ? 0 : x) << 16) | - (yorigin ? 0 : y)); - WREG32(EVERGREEN_CUR_HOT_SPOT + radeon_crtc->crtc_offset, (xorigin << 16) | yorigin); - WREG32(EVERGREEN_CUR_SIZE + radeon_crtc->crtc_offset, - ((radeon_crtc->cursor_width - 1) << 16) | (radeon_crtc->cursor_height - 1)); - } else if (ASIC_IS_AVIVO(rdev)) { - int w = radeon_crtc->cursor_width; + if (ASIC_IS_AVIVO(rdev)) { int i = 0; struct drm_crtc *crtc_p; @@ -260,7 +246,17 @@ int radeon_crtc_cursor_move(struct drm_crtc *crtc, if (w <= 0) w = 1; } + } + radeon_lock_cursor(crtc, true); + if (ASIC_IS_DCE4(rdev)) { + WREG32(EVERGREEN_CUR_POSITION + radeon_crtc->crtc_offset, + ((xorigin ? 0 : x) << 16) | + (yorigin ? 0 : y)); + WREG32(EVERGREEN_CUR_HOT_SPOT + radeon_crtc->crtc_offset, (xorigin << 16) | yorigin); + WREG32(EVERGREEN_CUR_SIZE + radeon_crtc->crtc_offset, + ((w - 1) << 16) | (radeon_crtc->cursor_height - 1)); + } else if (ASIC_IS_AVIVO(rdev)) { WREG32(AVIVO_D1CUR_POSITION + radeon_crtc->crtc_offset, ((xorigin ? 0 : x) << 16) | (yorigin ? 0 : y)); diff --git a/drivers/gpu/drm/radeon/radeon_fb.c b/drivers/gpu/drm/radeon/radeon_fb.c index 9cdf6a35bc2c..40b0c087b592 100644 --- a/drivers/gpu/drm/radeon/radeon_fb.c +++ b/drivers/gpu/drm/radeon/radeon_fb.c @@ -97,7 +97,6 @@ static void radeonfb_destroy_pinned_object(struct drm_gem_object *gobj) radeon_bo_unpin(rbo); radeon_bo_unreserve(rbo); } - drm_gem_object_handle_unreference(gobj); drm_gem_object_unreference_unlocked(gobj); } diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index 0afd1e62347d..b3b5306bb578 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -69,7 +69,7 @@ void radeon_ttm_placement_from_domain(struct radeon_bo *rbo, u32 domain) u32 c = 0; rbo->placement.fpfn = 0; - rbo->placement.lpfn = 0; + rbo->placement.lpfn = rbo->rdev->mc.active_vram_size >> PAGE_SHIFT; rbo->placement.placement = rbo->placements; rbo->placement.busy_placement = rbo->placements; if (domain & RADEON_GEM_DOMAIN_VRAM) diff --git a/drivers/gpu/drm/radeon/radeon_object.h b/drivers/gpu/drm/radeon/radeon_object.h index 353998dc2c03..3481bc7f6f58 100644 --- a/drivers/gpu/drm/radeon/radeon_object.h +++ b/drivers/gpu/drm/radeon/radeon_object.h @@ -124,11 +124,8 @@ static inline int radeon_bo_wait(struct radeon_bo *bo, u32 *mem_type, int r; r = ttm_bo_reserve(&bo->tbo, true, no_wait, false, 0); - if (unlikely(r != 0)) { - if (r != -ERESTARTSYS) - dev_err(bo->rdev->dev, "%p reserve failed for wait\n", bo); + if (unlikely(r != 0)) return r; - } spin_lock(&bo->tbo.lock); if (mem_type) *mem_type = bo->tbo.mem.mem_type; diff --git a/drivers/gpu/drm/radeon/rs600.c b/drivers/gpu/drm/radeon/rs600.c index cc05b230d7ef..51d5f7b5ab21 100644 --- a/drivers/gpu/drm/radeon/rs600.c +++ b/drivers/gpu/drm/radeon/rs600.c @@ -693,6 +693,7 @@ void rs600_mc_init(struct radeon_device *rdev) rdev->mc.real_vram_size = RREG32(RADEON_CONFIG_MEMSIZE); rdev->mc.mc_vram_size = rdev->mc.real_vram_size; rdev->mc.visible_vram_size = rdev->mc.aper_size; + rdev->mc.active_vram_size = rdev->mc.visible_vram_size; rdev->mc.igp_sideport_enabled = radeon_atombios_sideport_present(rdev); base = RREG32_MC(R_000004_MC_FB_LOCATION); base = G_000004_MC_FB_START(base) << 16; diff --git a/drivers/gpu/drm/radeon/rs690.c b/drivers/gpu/drm/radeon/rs690.c index 3e3f75718be3..4dc2a87ea680 100644 --- a/drivers/gpu/drm/radeon/rs690.c +++ b/drivers/gpu/drm/radeon/rs690.c @@ -157,6 +157,7 @@ void rs690_mc_init(struct radeon_device *rdev) rdev->mc.aper_base = pci_resource_start(rdev->pdev, 0); rdev->mc.aper_size = pci_resource_len(rdev->pdev, 0); rdev->mc.visible_vram_size = rdev->mc.aper_size; + rdev->mc.active_vram_size = rdev->mc.visible_vram_size; base = RREG32_MC(R_000100_MCCFG_FB_LOCATION); base = G_000100_MC_FB_START(base) << 16; rdev->mc.igp_sideport_enabled = radeon_atombios_sideport_present(rdev); diff --git a/drivers/gpu/drm/radeon/rv770.c b/drivers/gpu/drm/radeon/rv770.c index bfa59db374d2..9490da700749 100644 --- a/drivers/gpu/drm/radeon/rv770.c +++ b/drivers/gpu/drm/radeon/rv770.c @@ -267,6 +267,7 @@ static void rv770_mc_program(struct radeon_device *rdev) */ void r700_cp_stop(struct radeon_device *rdev) { + rdev->mc.active_vram_size = rdev->mc.visible_vram_size; WREG32(CP_ME_CNTL, (CP_ME_HALT | CP_PFP_HALT)); } @@ -992,6 +993,7 @@ int rv770_mc_init(struct radeon_device *rdev) rdev->mc.mc_vram_size = RREG32(CONFIG_MEMSIZE); rdev->mc.real_vram_size = RREG32(CONFIG_MEMSIZE); rdev->mc.visible_vram_size = rdev->mc.aper_size; + rdev->mc.active_vram_size = rdev->mc.visible_vram_size; r600_vram_gtt_location(rdev, &rdev->mc); radeon_update_bandwidth_info(rdev); diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index cb4cf7ef4d1e..db809e034cc4 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -442,6 +442,43 @@ out_err: } /** + * Call bo::reserved and with the lru lock held. + * Will release GPU memory type usage on destruction. + * This is the place to put in driver specific hooks. + * Will release the bo::reserved lock and the + * lru lock on exit. + */ + +static void ttm_bo_cleanup_memtype_use(struct ttm_buffer_object *bo) +{ + struct ttm_bo_global *glob = bo->glob; + + if (bo->ttm) { + + /** + * Release the lru_lock, since we don't want to have + * an atomic requirement on ttm_tt[unbind|destroy]. + */ + + spin_unlock(&glob->lru_lock); + ttm_tt_unbind(bo->ttm); + ttm_tt_destroy(bo->ttm); + bo->ttm = NULL; + spin_lock(&glob->lru_lock); + } + + if (bo->mem.mm_node) { + drm_mm_put_block(bo->mem.mm_node); + bo->mem.mm_node = NULL; + } + + atomic_set(&bo->reserved, 0); + wake_up_all(&bo->event_queue); + spin_unlock(&glob->lru_lock); +} + + +/** * If bo idle, remove from delayed- and lru lists, and unref. * If not idle, and already on delayed list, do nothing. * If not idle, and not on delayed list, put on delayed list, @@ -456,6 +493,7 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo, bool remove_all) int ret; spin_lock(&bo->lock); +retry: (void) ttm_bo_wait(bo, false, false, !remove_all); if (!bo->sync_obj) { @@ -464,31 +502,52 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo, bool remove_all) spin_unlock(&bo->lock); spin_lock(&glob->lru_lock); - put_count = ttm_bo_del_from_lru(bo); + ret = ttm_bo_reserve_locked(bo, false, !remove_all, false, 0); + + /** + * Someone else has the object reserved. Bail and retry. + */ - ret = ttm_bo_reserve_locked(bo, false, false, false, 0); - BUG_ON(ret); - if (bo->ttm) - ttm_tt_unbind(bo->ttm); + if (unlikely(ret == -EBUSY)) { + spin_unlock(&glob->lru_lock); + spin_lock(&bo->lock); + goto requeue; + } + + /** + * We can re-check for sync object without taking + * the bo::lock since setting the sync object requires + * also bo::reserved. A busy object at this point may + * be caused by another thread starting an accelerated + * eviction. + */ + + if (unlikely(bo->sync_obj)) { + atomic_set(&bo->reserved, 0); + wake_up_all(&bo->event_queue); + spin_unlock(&glob->lru_lock); + spin_lock(&bo->lock); + if (remove_all) + goto retry; + else + goto requeue; + } + + put_count = ttm_bo_del_from_lru(bo); if (!list_empty(&bo->ddestroy)) { list_del_init(&bo->ddestroy); ++put_count; } - if (bo->mem.mm_node) { - drm_mm_put_block(bo->mem.mm_node); - bo->mem.mm_node = NULL; - } - spin_unlock(&glob->lru_lock); - atomic_set(&bo->reserved, 0); + ttm_bo_cleanup_memtype_use(bo); while (put_count--) kref_put(&bo->list_kref, ttm_bo_ref_bug); return 0; } - +requeue: spin_lock(&glob->lru_lock); if (list_empty(&bo->ddestroy)) { void *sync_obj = bo->sync_obj; diff --git a/drivers/hid/hid-cando.c b/drivers/hid/hid-cando.c index 4267a6fdc277..5925bdcd417d 100644 --- a/drivers/hid/hid-cando.c +++ b/drivers/hid/hid-cando.c @@ -237,6 +237,8 @@ static const struct hid_device_id cando_devices[] = { USB_DEVICE_ID_CANDO_MULTI_TOUCH) }, { HID_USB_DEVICE(USB_VENDOR_ID_CANDO, USB_DEVICE_ID_CANDO_MULTI_TOUCH_11_6) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CANDO, + USB_DEVICE_ID_CANDO_MULTI_TOUCH_15_6) }, { } }; MODULE_DEVICE_TABLE(hid, cando_devices); diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 3f7292486024..a0dea3d1296e 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1292,6 +1292,7 @@ static const struct hid_device_id hid_blacklist[] = { { HID_USB_DEVICE(USB_VENDOR_ID_BTC, USB_DEVICE_ID_BTC_EMPREX_REMOTE_2) }, { HID_USB_DEVICE(USB_VENDOR_ID_CANDO, USB_DEVICE_ID_CANDO_MULTI_TOUCH) }, { HID_USB_DEVICE(USB_VENDOR_ID_CANDO, USB_DEVICE_ID_CANDO_MULTI_TOUCH_11_6) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CANDO, USB_DEVICE_ID_CANDO_MULTI_TOUCH_15_6) }, { HID_USB_DEVICE(USB_VENDOR_ID_CHERRY, USB_DEVICE_ID_CHERRY_CYMOTION) }, { HID_USB_DEVICE(USB_VENDOR_ID_CHERRY, USB_DEVICE_ID_CHERRY_CYMOTION_SOLAR) }, { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_TACTICAL_PAD) }, diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 765a4f53eb5c..c5ae5f1545bd 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -134,6 +134,7 @@ #define USB_VENDOR_ID_CANDO 0x2087 #define USB_DEVICE_ID_CANDO_MULTI_TOUCH 0x0a01 #define USB_DEVICE_ID_CANDO_MULTI_TOUCH_11_6 0x0b03 +#define USB_DEVICE_ID_CANDO_MULTI_TOUCH_15_6 0x0f01 #define USB_VENDOR_ID_CH 0x068e #define USB_DEVICE_ID_CH_PRO_PEDALS 0x00f2 @@ -503,6 +504,7 @@ #define USB_VENDOR_ID_TURBOX 0x062a #define USB_DEVICE_ID_TURBOX_KEYBOARD 0x0201 +#define USB_DEVICE_ID_TURBOX_TOUCHSCREEN_MOSART 0x7100 #define USB_VENDOR_ID_TWINHAN 0x6253 #define USB_DEVICE_ID_TWINHAN_IR_REMOTE 0x0100 diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index 47d70c523d93..a3866b5c0c43 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -109,6 +109,12 @@ static ssize_t hidraw_write(struct file *file, const char __user *buffer, size_t int ret = 0; mutex_lock(&minors_lock); + + if (!hidraw_table[minor]) { + ret = -ENODEV; + goto out; + } + dev = hidraw_table[minor]->hid; if (!dev->hid_output_raw_report) { @@ -244,6 +250,10 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd, mutex_lock(&minors_lock); dev = hidraw_table[minor]; + if (!dev) { + ret = -ENODEV; + goto out; + } switch (cmd) { case HIDIOCGRDESCSIZE: @@ -317,6 +327,7 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd, ret = -ENOTTY; } +out: mutex_unlock(&minors_lock); return ret; } diff --git a/drivers/hid/usbhid/hid-quirks.c b/drivers/hid/usbhid/hid-quirks.c index 70da3181c8a0..f0260c699adb 100644 --- a/drivers/hid/usbhid/hid-quirks.c +++ b/drivers/hid/usbhid/hid-quirks.c @@ -36,6 +36,7 @@ static const struct hid_blacklist { { USB_VENDOR_ID_DWAV, USB_DEVICE_ID_EGALAX_TOUCHCONTROLLER, HID_QUIRK_MULTI_INPUT | HID_QUIRK_NOGET }, { USB_VENDOR_ID_DWAV, USB_DEVICE_ID_DWAV_EGALAX_MULTITOUCH, HID_QUIRK_MULTI_INPUT }, { USB_VENDOR_ID_MOJO, USB_DEVICE_ID_RETRO_ADAPTER, HID_QUIRK_MULTI_INPUT }, + { USB_VENDOR_ID_TURBOX, USB_DEVICE_ID_TURBOX_TOUCHSCREEN_MOSART, HID_QUIRK_MULTI_INPUT }, { USB_VENDOR_ID_HAPP, USB_DEVICE_ID_UGCI_DRIVING, HID_QUIRK_BADPAD | HID_QUIRK_MULTI_INPUT }, { USB_VENDOR_ID_HAPP, USB_DEVICE_ID_UGCI_FLYING, HID_QUIRK_BADPAD | HID_QUIRK_MULTI_INPUT }, { USB_VENDOR_ID_HAPP, USB_DEVICE_ID_UGCI_FIGHTING, HID_QUIRK_BADPAD | HID_QUIRK_MULTI_INPUT }, diff --git a/drivers/i2c/busses/i2c-cpm.c b/drivers/i2c/busses/i2c-cpm.c index f7bd2613cecc..f2de3be35df3 100644 --- a/drivers/i2c/busses/i2c-cpm.c +++ b/drivers/i2c/busses/i2c-cpm.c @@ -677,6 +677,11 @@ static int __devinit cpm_i2c_probe(struct platform_device *ofdev, dev_dbg(&ofdev->dev, "hw routines for %s registered.\n", cpm->adap.name); + /* + * register OF I2C devices + */ + of_i2c_register_devices(&cpm->adap); + return 0; out_shut: cpm_i2c_shutdown(cpm); diff --git a/drivers/i2c/busses/i2c-davinci.c b/drivers/i2c/busses/i2c-davinci.c index b8feac5f2ef4..5795c8398c7c 100644 --- a/drivers/i2c/busses/i2c-davinci.c +++ b/drivers/i2c/busses/i2c-davinci.c @@ -331,21 +331,16 @@ i2c_davinci_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg, int stop) INIT_COMPLETION(dev->cmd_complete); dev->cmd_err = 0; - /* Take I2C out of reset, configure it as master and set the - * start bit */ - flag = DAVINCI_I2C_MDR_IRS | DAVINCI_I2C_MDR_MST | DAVINCI_I2C_MDR_STT; + /* Take I2C out of reset and configure it as master */ + flag = DAVINCI_I2C_MDR_IRS | DAVINCI_I2C_MDR_MST; /* if the slave address is ten bit address, enable XA bit */ if (msg->flags & I2C_M_TEN) flag |= DAVINCI_I2C_MDR_XA; if (!(msg->flags & I2C_M_RD)) flag |= DAVINCI_I2C_MDR_TRX; - if (stop) - flag |= DAVINCI_I2C_MDR_STP; - if (msg->len == 0) { + if (msg->len == 0) flag |= DAVINCI_I2C_MDR_RM; - flag &= ~DAVINCI_I2C_MDR_STP; - } /* Enable receive or transmit interrupts */ w = davinci_i2c_read_reg(dev, DAVINCI_I2C_IMR_REG); @@ -358,17 +353,28 @@ i2c_davinci_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg, int stop) dev->terminate = 0; /* + * Write mode register first as needed for correct behaviour + * on OMAP-L138, but don't set STT yet to avoid a race with XRDY + * occuring before we have loaded DXR + */ + davinci_i2c_write_reg(dev, DAVINCI_I2C_MDR_REG, flag); + + /* * First byte should be set here, not after interrupt, * because transmit-data-ready interrupt can come before * NACK-interrupt during sending of previous message and * ICDXR may have wrong data + * It also saves us one interrupt, slightly faster */ if ((!(msg->flags & I2C_M_RD)) && dev->buf_len) { davinci_i2c_write_reg(dev, DAVINCI_I2C_DXR_REG, *dev->buf++); dev->buf_len--; } - /* write the data into mode register; start transmitting */ + /* Set STT to begin transmit now DXR is loaded */ + flag |= DAVINCI_I2C_MDR_STT; + if (stop && msg->len != 0) + flag |= DAVINCI_I2C_MDR_STP; davinci_i2c_write_reg(dev, DAVINCI_I2C_MDR_REG, flag); r = wait_for_completion_interruptible_timeout(&dev->cmd_complete, diff --git a/drivers/i2c/busses/i2c-ibm_iic.c b/drivers/i2c/busses/i2c-ibm_iic.c index 43ca32fddde2..89eedf45d30e 100644 --- a/drivers/i2c/busses/i2c-ibm_iic.c +++ b/drivers/i2c/busses/i2c-ibm_iic.c @@ -761,6 +761,9 @@ static int __devinit iic_probe(struct platform_device *ofdev, dev_info(&ofdev->dev, "using %s mode\n", dev->fast_mode ? "fast (400 kHz)" : "standard (100 kHz)"); + /* Now register all the child nodes */ + of_i2c_register_devices(adap); + return 0; error_cleanup: diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index d1ff9408dc1f..4c2a62b75b5c 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -159,15 +159,9 @@ static int i2c_imx_bus_busy(struct imx_i2c_struct *i2c_imx, int for_busy) static int i2c_imx_trx_complete(struct imx_i2c_struct *i2c_imx) { - int result; - - result = wait_event_interruptible_timeout(i2c_imx->queue, - i2c_imx->i2csr & I2SR_IIF, HZ / 10); + wait_event_timeout(i2c_imx->queue, i2c_imx->i2csr & I2SR_IIF, HZ / 10); - if (unlikely(result < 0)) { - dev_dbg(&i2c_imx->adapter.dev, "<%s> result < 0\n", __func__); - return result; - } else if (unlikely(!(i2c_imx->i2csr & I2SR_IIF))) { + if (unlikely(!(i2c_imx->i2csr & I2SR_IIF))) { dev_dbg(&i2c_imx->adapter.dev, "<%s> Timeout\n", __func__); return -ETIMEDOUT; } @@ -295,7 +289,7 @@ static irqreturn_t i2c_imx_isr(int irq, void *dev_id) i2c_imx->i2csr = temp; temp &= ~I2SR_IIF; writeb(temp, i2c_imx->base + IMX_I2C_I2SR); - wake_up_interruptible(&i2c_imx->queue); + wake_up(&i2c_imx->queue); return IRQ_HANDLED; } diff --git a/drivers/i2c/busses/i2c-mpc.c b/drivers/i2c/busses/i2c-mpc.c index a1c419a716af..b74e6dc6886c 100644 --- a/drivers/i2c/busses/i2c-mpc.c +++ b/drivers/i2c/busses/i2c-mpc.c @@ -632,6 +632,7 @@ static int __devinit fsl_i2c_probe(struct platform_device *op, dev_err(i2c->dev, "failed to add adapter\n"); goto fail_add; } + of_i2c_register_devices(&i2c->adap); return result; diff --git a/drivers/i2c/busses/i2c-pca-isa.c b/drivers/i2c/busses/i2c-pca-isa.c index bbd77603a417..29933f87d8fa 100644 --- a/drivers/i2c/busses/i2c-pca-isa.c +++ b/drivers/i2c/busses/i2c-pca-isa.c @@ -71,8 +71,8 @@ static int pca_isa_readbyte(void *pd, int reg) static int pca_isa_waitforcompletion(void *pd) { - long ret = ~0; unsigned long timeout; + long ret; if (irq > -1) { ret = wait_event_timeout(pca_wait, @@ -81,11 +81,15 @@ static int pca_isa_waitforcompletion(void *pd) } else { /* Do polling */ timeout = jiffies + pca_isa_ops.timeout; - while (((pca_isa_readbyte(pd, I2C_PCA_CON) - & I2C_PCA_CON_SI) == 0) - && (ret = time_before(jiffies, timeout))) + do { + ret = time_before(jiffies, timeout); + if (pca_isa_readbyte(pd, I2C_PCA_CON) + & I2C_PCA_CON_SI) + break; udelay(100); + } while (ret); } + return ret > 0; } diff --git a/drivers/i2c/busses/i2c-pca-platform.c b/drivers/i2c/busses/i2c-pca-platform.c index ef5c78487eb7..5f6d7f89e225 100644 --- a/drivers/i2c/busses/i2c-pca-platform.c +++ b/drivers/i2c/busses/i2c-pca-platform.c @@ -80,8 +80,8 @@ static void i2c_pca_pf_writebyte32(void *pd, int reg, int val) static int i2c_pca_pf_waitforcompletion(void *pd) { struct i2c_pca_pf_data *i2c = pd; - long ret = ~0; unsigned long timeout; + long ret; if (i2c->irq) { ret = wait_event_timeout(i2c->wait, @@ -90,10 +90,13 @@ static int i2c_pca_pf_waitforcompletion(void *pd) } else { /* Do polling */ timeout = jiffies + i2c->adap.timeout; - while (((i2c->algo_data.read_byte(i2c, I2C_PCA_CON) - & I2C_PCA_CON_SI) == 0) - && (ret = time_before(jiffies, timeout))) + do { + ret = time_before(jiffies, timeout); + if (i2c->algo_data.read_byte(i2c, I2C_PCA_CON) + & I2C_PCA_CON_SI) + break; udelay(100); + } while (ret); } return ret > 0; diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 6649176de940..bea4c5021d26 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -32,7 +32,6 @@ #include <linux/init.h> #include <linux/idr.h> #include <linux/mutex.h> -#include <linux/of_i2c.h> #include <linux/of_device.h> #include <linux/completion.h> #include <linux/hardirq.h> @@ -197,11 +196,12 @@ static int i2c_device_pm_suspend(struct device *dev) { const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - if (pm_runtime_suspended(dev)) - return 0; - - if (pm) - return pm->suspend ? pm->suspend(dev) : 0; + if (pm) { + if (pm_runtime_suspended(dev)) + return 0; + else + return pm->suspend ? pm->suspend(dev) : 0; + } return i2c_legacy_suspend(dev, PMSG_SUSPEND); } @@ -216,12 +216,6 @@ static int i2c_device_pm_resume(struct device *dev) else ret = i2c_legacy_resume(dev); - if (!ret) { - pm_runtime_disable(dev); - pm_runtime_set_active(dev); - pm_runtime_enable(dev); - } - return ret; } @@ -229,11 +223,12 @@ static int i2c_device_pm_freeze(struct device *dev) { const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - if (pm_runtime_suspended(dev)) - return 0; - - if (pm) - return pm->freeze ? pm->freeze(dev) : 0; + if (pm) { + if (pm_runtime_suspended(dev)) + return 0; + else + return pm->freeze ? pm->freeze(dev) : 0; + } return i2c_legacy_suspend(dev, PMSG_FREEZE); } @@ -242,11 +237,12 @@ static int i2c_device_pm_thaw(struct device *dev) { const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - if (pm_runtime_suspended(dev)) - return 0; - - if (pm) - return pm->thaw ? pm->thaw(dev) : 0; + if (pm) { + if (pm_runtime_suspended(dev)) + return 0; + else + return pm->thaw ? pm->thaw(dev) : 0; + } return i2c_legacy_resume(dev); } @@ -255,11 +251,12 @@ static int i2c_device_pm_poweroff(struct device *dev) { const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - if (pm_runtime_suspended(dev)) - return 0; - - if (pm) - return pm->poweroff ? pm->poweroff(dev) : 0; + if (pm) { + if (pm_runtime_suspended(dev)) + return 0; + else + return pm->poweroff ? pm->poweroff(dev) : 0; + } return i2c_legacy_suspend(dev, PMSG_HIBERNATE); } @@ -876,9 +873,6 @@ static int i2c_register_adapter(struct i2c_adapter *adap) if (adap->nr < __i2c_first_dynamic_bus_num) i2c_scan_static_board_info(adap); - /* Register devices from the device tree */ - of_i2c_register_devices(adap); - /* Notify drivers */ mutex_lock(&core_lock); bus_for_each_drv(&i2c_bus_type, NULL, adap, __process_new_adapter); diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 0906fc5b69b9..c37ef64d1465 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -157,13 +157,13 @@ static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = { { /* MWAIT C5 */ }, { /* MWAIT C6 */ .name = "ATM-C6", - .desc = "MWAIT 0x40", - .driver_data = (void *) 0x40, + .desc = "MWAIT 0x52", + .driver_data = (void *) 0x52, .flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 200, + .exit_latency = 140, .power_usage = 150, - .target_residency = 800, - .enter = NULL }, /* disabled */ + .target_residency = 560, + .enter = &intel_idle }, }; /** diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index c908c5f83645..af9ee313c10b 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -28,7 +28,7 @@ struct evdev { int minor; struct input_handle handle; wait_queue_head_t wait; - struct evdev_client *grab; + struct evdev_client __rcu *grab; struct list_head client_list; spinlock_t client_lock; /* protects client_list */ struct mutex mutex; @@ -669,6 +669,9 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd, if ((_IOC_NR(cmd) & ~ABS_MAX) == _IOC_NR(EVIOCGABS(0))) { + if (!dev->absinfo) + return -EINVAL; + t = _IOC_NR(cmd) & ABS_MAX; abs = dev->absinfo[t]; @@ -680,10 +683,13 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd, } } - if (_IOC_DIR(cmd) == _IOC_READ) { + if (_IOC_DIR(cmd) == _IOC_WRITE) { if ((_IOC_NR(cmd) & ~ABS_MAX) == _IOC_NR(EVIOCSABS(0))) { + if (!dev->absinfo) + return -EINVAL; + t = _IOC_NR(cmd) & ABS_MAX; if (copy_from_user(&abs, p, min_t(size_t, diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c index d85bd8a7967d..22239e988498 100644 --- a/drivers/input/joydev.c +++ b/drivers/input/joydev.c @@ -483,6 +483,9 @@ static int joydev_handle_JSIOCSAXMAP(struct joydev *joydev, memcpy(joydev->abspam, abspam, len); + for (i = 0; i < joydev->nabs; i++) + joydev->absmap[joydev->abspam[i]] = i; + out: kfree(abspam); return retval; diff --git a/drivers/input/misc/hp_sdc_rtc.c b/drivers/input/misc/hp_sdc_rtc.c index c19066479057..7e2c12a5b839 100644 --- a/drivers/input/misc/hp_sdc_rtc.c +++ b/drivers/input/misc/hp_sdc_rtc.c @@ -104,7 +104,7 @@ static int hp_sdc_rtc_do_read_bbrtc (struct rtc_time *rtctm) t.endidx = 91; t.seq = tseq; t.act.semaphore = &tsem; - init_MUTEX_LOCKED(&tsem); + sema_init(&tsem, 0); if (hp_sdc_enqueue_transaction(&t)) return -1; @@ -698,7 +698,7 @@ static int __init hp_sdc_rtc_init(void) return -ENODEV; #endif - init_MUTEX(&i8042tregs); + sema_init(&i8042tregs, 1); if ((ret = hp_sdc_request_timer_irq(&hp_sdc_rtc_isr))) return ret; diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c index 0d4266a533a5..360698553eb5 100644 --- a/drivers/input/misc/uinput.c +++ b/drivers/input/misc/uinput.c @@ -404,6 +404,13 @@ static int uinput_setup_device(struct uinput_device *udev, const char __user *bu retval = uinput_validate_absbits(dev); if (retval < 0) goto exit; + if (test_bit(ABS_MT_SLOT, dev->absbit)) { + int nslot = input_abs_get_max(dev, ABS_MT_SLOT) + 1; + input_mt_create_slots(dev, nslot); + input_set_events_per_packet(dev, 6 * nslot); + } else if (test_bit(ABS_MT_POSITION_X, dev->absbit)) { + input_set_events_per_packet(dev, 60); + } } udev->state = UIST_SETUP_COMPLETE; diff --git a/drivers/input/serio/hil_mlc.c b/drivers/input/serio/hil_mlc.c index c92f4edfee7b..e5624d8f1709 100644 --- a/drivers/input/serio/hil_mlc.c +++ b/drivers/input/serio/hil_mlc.c @@ -915,15 +915,15 @@ int hil_mlc_register(hil_mlc *mlc) mlc->ostarted = 0; rwlock_init(&mlc->lock); - init_MUTEX(&mlc->osem); + sema_init(&mlc->osem, 1); - init_MUTEX(&mlc->isem); + sema_init(&mlc->isem, 1); mlc->icount = -1; mlc->imatch = 0; mlc->opercnt = 0; - init_MUTEX_LOCKED(&(mlc->csem)); + sema_init(&(mlc->csem), 0); hil_mlc_clear_di_scratch(mlc); hil_mlc_clear_di_map(mlc, 0); diff --git a/drivers/input/serio/hp_sdc.c b/drivers/input/serio/hp_sdc.c index bcc2d30ec245..8c0b51c31424 100644 --- a/drivers/input/serio/hp_sdc.c +++ b/drivers/input/serio/hp_sdc.c @@ -905,7 +905,7 @@ static int __init hp_sdc_init(void) ts_sync[1] = 0x0f; ts_sync[2] = ts_sync[3] = ts_sync[4] = ts_sync[5] = 0; t_sync.act.semaphore = &s_sync; - init_MUTEX_LOCKED(&s_sync); + sema_init(&s_sync, 0); hp_sdc_enqueue_transaction(&t_sync); down(&s_sync); /* Wait for t_sync to complete */ @@ -1039,7 +1039,7 @@ static int __init hp_sdc_register(void) return hp_sdc.dev_err; } - init_MUTEX_LOCKED(&tq_init_sem); + sema_init(&tq_init_sem, 0); tq_init.actidx = 0; tq_init.idx = 1; diff --git a/drivers/input/tablet/wacom_sys.c b/drivers/input/tablet/wacom_sys.c index 42ba3691d908..b35876ee6908 100644 --- a/drivers/input/tablet/wacom_sys.c +++ b/drivers/input/tablet/wacom_sys.c @@ -103,27 +103,26 @@ static void wacom_sys_irq(struct urb *urb) static int wacom_open(struct input_dev *dev) { struct wacom *wacom = input_get_drvdata(dev); + int retval = 0; - mutex_lock(&wacom->lock); - - wacom->irq->dev = wacom->usbdev; - - if (usb_autopm_get_interface(wacom->intf) < 0) { - mutex_unlock(&wacom->lock); + if (usb_autopm_get_interface(wacom->intf) < 0) return -EIO; - } + + mutex_lock(&wacom->lock); if (usb_submit_urb(wacom->irq, GFP_KERNEL)) { - usb_autopm_put_interface(wacom->intf); - mutex_unlock(&wacom->lock); - return -EIO; + retval = -EIO; + goto out; } wacom->open = true; wacom->intf->needs_remote_wakeup = 1; +out: mutex_unlock(&wacom->lock); - return 0; + if (retval) + usb_autopm_put_interface(wacom->intf); + return retval; } static void wacom_close(struct input_dev *dev) @@ -135,6 +134,8 @@ static void wacom_close(struct input_dev *dev) wacom->open = false; wacom->intf->needs_remote_wakeup = 0; mutex_unlock(&wacom->lock); + + usb_autopm_put_interface(wacom->intf); } static int wacom_parse_hid(struct usb_interface *intf, struct hid_descriptor *hid_desc, diff --git a/drivers/input/tablet/wacom_wac.c b/drivers/input/tablet/wacom_wac.c index 6e29badb969e..47fd7a041c52 100644 --- a/drivers/input/tablet/wacom_wac.c +++ b/drivers/input/tablet/wacom_wac.c @@ -442,8 +442,10 @@ static void wacom_intuos_general(struct wacom_wac *wacom) /* general pen packet */ if ((data[1] & 0xb8) == 0xa0) { t = (data[6] << 2) | ((data[7] >> 6) & 3); - if (features->type >= INTUOS4S && features->type <= INTUOS4L) + if ((features->type >= INTUOS4S && features->type <= INTUOS4L) || + features->type == WACOM_21UX2) { t = (t << 1) | (data[1] & 1); + } input_report_abs(input, ABS_PRESSURE, t); input_report_abs(input, ABS_TILT_X, ((data[7] << 1) & 0x7e) | (data[8] >> 7)); diff --git a/drivers/isdn/sc/interrupt.c b/drivers/isdn/sc/interrupt.c index 485be8b1e1b3..f0225bc0f267 100644 --- a/drivers/isdn/sc/interrupt.c +++ b/drivers/isdn/sc/interrupt.c @@ -112,11 +112,19 @@ irqreturn_t interrupt_handler(int dummy, void *card_inst) } else if(callid>=0x0000 && callid<=0x7FFF) { + int len; + pr_debug("%s: Got Incoming Call\n", sc_adapter[card]->devicename); - strcpy(setup.phone,&(rcvmsg.msg_data.byte_array[4])); - strcpy(setup.eazmsn, - sc_adapter[card]->channel[rcvmsg.phy_link_no-1].dn); + len = strlcpy(setup.phone, &(rcvmsg.msg_data.byte_array[4]), + sizeof(setup.phone)); + if (len >= sizeof(setup.phone)) + continue; + len = strlcpy(setup.eazmsn, + sc_adapter[card]->channel[rcvmsg.phy_link_no - 1].dn, + sizeof(setup.eazmsn)); + if (len >= sizeof(setup.eazmsn)) + continue; setup.si1 = 7; setup.si2 = 0; setup.plan = 0; @@ -176,7 +184,9 @@ irqreturn_t interrupt_handler(int dummy, void *card_inst) * Handle a GetMyNumber Rsp */ if (IS_CE_MESSAGE(rcvmsg,Call,0,GetMyNumber)){ - strcpy(sc_adapter[card]->channel[rcvmsg.phy_link_no-1].dn,rcvmsg.msg_data.byte_array); + strlcpy(sc_adapter[card]->channel[rcvmsg.phy_link_no - 1].dn, + rcvmsg.msg_data.byte_array, + sizeof(rcvmsg.msg_data.byte_array)); continue; } diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c index 1c4ee6e77937..bf64e49d996a 100644 --- a/drivers/macintosh/adb.c +++ b/drivers/macintosh/adb.c @@ -83,7 +83,7 @@ static struct adb_driver *adb_controller; BLOCKING_NOTIFIER_HEAD(adb_client_list); static int adb_got_sleep; static int adb_inited; -static DECLARE_MUTEX(adb_probe_mutex); +static DEFINE_SEMAPHORE(adb_probe_mutex); static int sleepy_trackpad; static int autopoll_devs; int __adb_probe_sync; diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index ed4900ade93a..e4fb58db5454 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1000,10 +1000,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) page = bitmap->sb_page; offset = sizeof(bitmap_super_t); if (!file) - read_sb_page(bitmap->mddev, - bitmap->mddev->bitmap_info.offset, - page, - index, count); + page = read_sb_page( + bitmap->mddev, + bitmap->mddev->bitmap_info.offset, + page, + index, count); } else if (file) { page = read_page(file, index, bitmap, count); offset = 0; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ad83a4dcadc3..0b830bbe1d8b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1839,7 +1839,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* take from bio_init */ bio->bi_next = NULL; + bio->bi_flags &= ~(BIO_POOL_MASK-1); bio->bi_flags |= 1 << BIO_UPTODATE; + bio->bi_comp_cpu = -1; bio->bi_rw = READ; bio->bi_vcnt = 0; bio->bi_idx = 0; @@ -1912,7 +1914,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; BUG_ON(sync_blocks < (PAGE_SIZE>>9)); - if (len > (sync_blocks<<9)) + if ((len >> 9) > sync_blocks) len = sync_blocks<<9; } diff --git a/drivers/media/IR/ir-keytable.c b/drivers/media/IR/ir-keytable.c index 7e82a9df726b..7961d59f5cac 100644 --- a/drivers/media/IR/ir-keytable.c +++ b/drivers/media/IR/ir-keytable.c @@ -319,7 +319,7 @@ static void ir_timer_keyup(unsigned long cookie) * a keyup event might follow immediately after the keydown. */ spin_lock_irqsave(&ir->keylock, flags); - if (time_is_after_eq_jiffies(ir->keyup_jiffies)) + if (time_is_before_eq_jiffies(ir->keyup_jiffies)) ir_keyup(ir); spin_unlock_irqrestore(&ir->keylock, flags); } @@ -510,6 +510,13 @@ int __ir_input_register(struct input_dev *input_dev, (ir_dev->props && ir_dev->props->driver_type == RC_DRIVER_IR_RAW) ? " in raw mode" : ""); + /* + * Default delay of 250ms is too short for some protocols, expecially + * since the timeout is currently set to 250ms. Increase it to 500ms, + * to avoid wrong repetition of the keycodes. + */ + input_dev->rep[REP_DELAY] = 500; + return 0; out_event: diff --git a/drivers/media/IR/ir-lirc-codec.c b/drivers/media/IR/ir-lirc-codec.c index 77b5946413c0..e63f757d5d72 100644 --- a/drivers/media/IR/ir-lirc-codec.c +++ b/drivers/media/IR/ir-lirc-codec.c @@ -267,7 +267,7 @@ static int ir_lirc_register(struct input_dev *input_dev) features |= LIRC_CAN_SET_SEND_CARRIER; if (ir_dev->props->s_tx_duty_cycle) - features |= LIRC_CAN_SET_REC_DUTY_CYCLE; + features |= LIRC_CAN_SET_SEND_DUTY_CYCLE; } if (ir_dev->props->s_rx_carrier_range) diff --git a/drivers/media/IR/ir-raw-event.c b/drivers/media/IR/ir-raw-event.c index 43094e7eccfa..8e0e1b1f8c87 100644 --- a/drivers/media/IR/ir-raw-event.c +++ b/drivers/media/IR/ir-raw-event.c @@ -279,9 +279,11 @@ int ir_raw_event_register(struct input_dev *input_dev) "rc%u", (unsigned int)ir->devno); if (IS_ERR(ir->raw->thread)) { + int ret = PTR_ERR(ir->raw->thread); + kfree(ir->raw); ir->raw = NULL; - return PTR_ERR(ir->raw->thread); + return ret; } mutex_lock(&ir_raw_handler_lock); diff --git a/drivers/media/IR/ir-sysfs.c b/drivers/media/IR/ir-sysfs.c index 96dafc425c8e..46d42467f9b4 100644 --- a/drivers/media/IR/ir-sysfs.c +++ b/drivers/media/IR/ir-sysfs.c @@ -67,13 +67,14 @@ static ssize_t show_protocols(struct device *d, char *tmp = buf; int i; - if (ir_dev->props->driver_type == RC_DRIVER_SCANCODE) { + if (ir_dev->props && ir_dev->props->driver_type == RC_DRIVER_SCANCODE) { enabled = ir_dev->rc_tab.ir_type; allowed = ir_dev->props->allowed_protos; - } else { + } else if (ir_dev->raw) { enabled = ir_dev->raw->enabled_protocols; allowed = ir_raw_get_allowed_protocols(); - } + } else + return sprintf(tmp, "[builtin]\n"); IR_dprintk(1, "allowed - 0x%llx, enabled - 0x%llx\n", (long long)allowed, @@ -121,10 +122,14 @@ static ssize_t store_protocols(struct device *d, int rc, i, count = 0; unsigned long flags; - if (ir_dev->props->driver_type == RC_DRIVER_SCANCODE) + if (ir_dev->props && ir_dev->props->driver_type == RC_DRIVER_SCANCODE) type = ir_dev->rc_tab.ir_type; - else + else if (ir_dev->raw) type = ir_dev->raw->enabled_protocols; + else { + IR_dprintk(1, "Protocol switching not supported\n"); + return -EINVAL; + } while ((tmp = strsep((char **) &data, " \n")) != NULL) { if (!*tmp) @@ -185,7 +190,7 @@ static ssize_t store_protocols(struct device *d, } } - if (ir_dev->props->driver_type == RC_DRIVER_SCANCODE) { + if (ir_dev->props && ir_dev->props->driver_type == RC_DRIVER_SCANCODE) { spin_lock_irqsave(&ir_dev->rc_tab.lock, flags); ir_dev->rc_tab.ir_type = type; spin_unlock_irqrestore(&ir_dev->rc_tab.lock, flags); diff --git a/drivers/media/IR/keymaps/rc-rc6-mce.c b/drivers/media/IR/keymaps/rc-rc6-mce.c index 64264f7f838f..39557ad401b6 100644 --- a/drivers/media/IR/keymaps/rc-rc6-mce.c +++ b/drivers/media/IR/keymaps/rc-rc6-mce.c @@ -19,6 +19,7 @@ static struct ir_scancode rc6_mce[] = { { 0x800f0416, KEY_PLAY }, { 0x800f0418, KEY_PAUSE }, + { 0x800f046e, KEY_PLAYPAUSE }, { 0x800f0419, KEY_STOP }, { 0x800f0417, KEY_RECORD }, @@ -37,6 +38,8 @@ static struct ir_scancode rc6_mce[] = { { 0x800f0411, KEY_VOLUMEDOWN }, { 0x800f0412, KEY_CHANNELUP }, { 0x800f0413, KEY_CHANNELDOWN }, + { 0x800f043a, KEY_BRIGHTNESSUP }, + { 0x800f0480, KEY_BRIGHTNESSDOWN }, { 0x800f0401, KEY_NUMERIC_1 }, { 0x800f0402, KEY_NUMERIC_2 }, diff --git a/drivers/media/IR/mceusb.c b/drivers/media/IR/mceusb.c index ac6bb2c01a48..bc620e10ef77 100644 --- a/drivers/media/IR/mceusb.c +++ b/drivers/media/IR/mceusb.c @@ -120,6 +120,10 @@ static struct usb_device_id mceusb_dev_table[] = { { USB_DEVICE(VENDOR_PHILIPS, 0x0613) }, /* Philips eHome Infrared Transceiver */ { USB_DEVICE(VENDOR_PHILIPS, 0x0815) }, + /* Philips/Spinel plus IR transceiver for ASUS */ + { USB_DEVICE(VENDOR_PHILIPS, 0x206c) }, + /* Philips/Spinel plus IR transceiver for ASUS */ + { USB_DEVICE(VENDOR_PHILIPS, 0x2088) }, /* Realtek MCE IR Receiver */ { USB_DEVICE(VENDOR_REALTEK, 0x0161) }, /* SMK/Toshiba G83C0004D410 */ diff --git a/drivers/media/dvb/dvb-usb/dib0700_core.c b/drivers/media/dvb/dvb-usb/dib0700_core.c index fe818348b8a3..48397f103d32 100644 --- a/drivers/media/dvb/dvb-usb/dib0700_core.c +++ b/drivers/media/dvb/dvb-usb/dib0700_core.c @@ -673,9 +673,6 @@ static int dib0700_probe(struct usb_interface *intf, else dev->props.rc.core.bulk_mode = false; - /* Need a higher delay, to avoid wrong repeat */ - dev->rc_input_dev->rep[REP_DELAY] = 500; - dib0700_rc_setup(dev); return 0; diff --git a/drivers/media/dvb/dvb-usb/dib0700_devices.c b/drivers/media/dvb/dvb-usb/dib0700_devices.c index f634d2e784b2..e06acd1fecb6 100644 --- a/drivers/media/dvb/dvb-usb/dib0700_devices.c +++ b/drivers/media/dvb/dvb-usb/dib0700_devices.c @@ -940,6 +940,58 @@ static int stk7070p_frontend_attach(struct dvb_usb_adapter *adap) return adap->fe == NULL ? -ENODEV : 0; } +/* STK7770P */ +static struct dib7000p_config dib7770p_dib7000p_config = { + .output_mpeg2_in_188_bytes = 1, + + .agc_config_count = 1, + .agc = &dib7070_agc_config, + .bw = &dib7070_bw_config_12_mhz, + .tuner_is_baseband = 1, + .spur_protect = 1, + + .gpio_dir = DIB7000P_GPIO_DEFAULT_DIRECTIONS, + .gpio_val = DIB7000P_GPIO_DEFAULT_VALUES, + .gpio_pwm_pos = DIB7000P_GPIO_DEFAULT_PWM_POS, + + .hostbus_diversity = 1, + .enable_current_mirror = 1, + .disable_sample_and_hold = 0, +}; + +static int stk7770p_frontend_attach(struct dvb_usb_adapter *adap) +{ + struct usb_device_descriptor *p = &adap->dev->udev->descriptor; + if (p->idVendor == cpu_to_le16(USB_VID_PINNACLE) && + p->idProduct == cpu_to_le16(USB_PID_PINNACLE_PCTV72E)) + dib0700_set_gpio(adap->dev, GPIO6, GPIO_OUT, 0); + else + dib0700_set_gpio(adap->dev, GPIO6, GPIO_OUT, 1); + msleep(10); + dib0700_set_gpio(adap->dev, GPIO9, GPIO_OUT, 1); + dib0700_set_gpio(adap->dev, GPIO4, GPIO_OUT, 1); + dib0700_set_gpio(adap->dev, GPIO7, GPIO_OUT, 1); + dib0700_set_gpio(adap->dev, GPIO10, GPIO_OUT, 0); + + dib0700_ctrl_clock(adap->dev, 72, 1); + + msleep(10); + dib0700_set_gpio(adap->dev, GPIO10, GPIO_OUT, 1); + msleep(10); + dib0700_set_gpio(adap->dev, GPIO0, GPIO_OUT, 1); + + if (dib7000p_i2c_enumeration(&adap->dev->i2c_adap, 1, 18, + &dib7770p_dib7000p_config) != 0) { + err("%s: dib7000p_i2c_enumeration failed. Cannot continue\n", + __func__); + return -ENODEV; + } + + adap->fe = dvb_attach(dib7000p_attach, &adap->dev->i2c_adap, 0x80, + &dib7770p_dib7000p_config); + return adap->fe == NULL ? -ENODEV : 0; +} + /* DIB807x generic */ static struct dibx000_agc_config dib807x_agc_config[2] = { { @@ -1781,7 +1833,7 @@ struct usb_device_id dib0700_usb_id_table[] = { /* 60 */{ USB_DEVICE(USB_VID_TERRATEC, USB_PID_TERRATEC_CINERGY_T_XXS_2) }, { USB_DEVICE(USB_VID_DIBCOM, USB_PID_DIBCOM_STK807XPVR) }, { USB_DEVICE(USB_VID_DIBCOM, USB_PID_DIBCOM_STK807XP) }, - { USB_DEVICE(USB_VID_PIXELVIEW, USB_PID_PIXELVIEW_SBTVD) }, + { USB_DEVICE_VER(USB_VID_PIXELVIEW, USB_PID_PIXELVIEW_SBTVD, 0x000, 0x3f00) }, { USB_DEVICE(USB_VID_EVOLUTEPC, USB_PID_TVWAY_PLUS) }, /* 65 */{ USB_DEVICE(USB_VID_PINNACLE, USB_PID_PINNACLE_PCTV73ESE) }, { USB_DEVICE(USB_VID_PINNACLE, USB_PID_PINNACLE_PCTV282E) }, @@ -2406,7 +2458,7 @@ struct dvb_usb_device_properties dib0700_devices[] = { .pid_filter_count = 32, .pid_filter = stk70x0p_pid_filter, .pid_filter_ctrl = stk70x0p_pid_filter_ctrl, - .frontend_attach = stk7070p_frontend_attach, + .frontend_attach = stk7770p_frontend_attach, .tuner_attach = dib7770p_tuner_attach, DIB0700_DEFAULT_STREAMING_CONFIG(0x02), diff --git a/drivers/media/dvb/dvb-usb/opera1.c b/drivers/media/dvb/dvb-usb/opera1.c index 6b22ec64ab0c..f896337b4535 100644 --- a/drivers/media/dvb/dvb-usb/opera1.c +++ b/drivers/media/dvb/dvb-usb/opera1.c @@ -483,9 +483,7 @@ static int opera1_xilinx_load_firmware(struct usb_device *dev, } } kfree(p); - if (fw) { - release_firmware(fw); - } + release_firmware(fw); return ret; } diff --git a/drivers/media/dvb/frontends/dib7000p.c b/drivers/media/dvb/frontends/dib7000p.c index 2e28b973dfd3..3aed0d433921 100644 --- a/drivers/media/dvb/frontends/dib7000p.c +++ b/drivers/media/dvb/frontends/dib7000p.c @@ -260,6 +260,9 @@ static void dib7000p_set_adc_state(struct dib7000p_state *state, enum dibx000_ad // dprintk( "908: %x, 909: %x\n", reg_908, reg_909); + reg_909 |= (state->cfg.disable_sample_and_hold & 1) << 4; + reg_908 |= (state->cfg.enable_current_mirror & 1) << 7; + dib7000p_write_word(state, 908, reg_908); dib7000p_write_word(state, 909, reg_909); } @@ -778,7 +781,10 @@ static void dib7000p_set_channel(struct dib7000p_state *state, struct dvb_fronte default: case GUARD_INTERVAL_1_32: value *= 1; break; } - state->div_sync_wait = (value * 3) / 2 + 32; // add 50% SFN margin + compensate for one DVSY-fifo TODO + if (state->cfg.diversity_delay == 0) + state->div_sync_wait = (value * 3) / 2 + 48; // add 50% SFN margin + compensate for one DVSY-fifo + else + state->div_sync_wait = (value * 3) / 2 + state->cfg.diversity_delay; // add 50% SFN margin + compensate for one DVSY-fifo /* deactive the possibility of diversity reception if extended interleaver */ state->div_force_off = !1 && ch->u.ofdm.transmission_mode != TRANSMISSION_MODE_8K; diff --git a/drivers/media/dvb/frontends/dib7000p.h b/drivers/media/dvb/frontends/dib7000p.h index 805dd13a97ee..da17345bf5bd 100644 --- a/drivers/media/dvb/frontends/dib7000p.h +++ b/drivers/media/dvb/frontends/dib7000p.h @@ -33,6 +33,11 @@ struct dib7000p_config { int (*agc_control) (struct dvb_frontend *, u8 before); u8 output_mode; + u8 disable_sample_and_hold : 1; + + u8 enable_current_mirror : 1; + u8 diversity_delay; + }; #define DEFAULT_DIB7000P_I2C_ADDRESS 18 diff --git a/drivers/media/dvb/siano/smscoreapi.c b/drivers/media/dvb/siano/smscoreapi.c index d93468cd3a85..ff3b0fa901b3 100644 --- a/drivers/media/dvb/siano/smscoreapi.c +++ b/drivers/media/dvb/siano/smscoreapi.c @@ -1098,33 +1098,26 @@ EXPORT_SYMBOL_GPL(smscore_onresponse); * * @return pointer to descriptor on success, NULL on error. */ -struct smscore_buffer_t *smscore_getbuffer(struct smscore_device_t *coredev) + +struct smscore_buffer_t *get_entry(struct smscore_device_t *coredev) { struct smscore_buffer_t *cb = NULL; unsigned long flags; - DEFINE_WAIT(wait); - spin_lock_irqsave(&coredev->bufferslock, flags); - - /* This function must return a valid buffer, since the buffer list is - * finite, we check that there is an available buffer, if not, we wait - * until such buffer become available. - */ - - prepare_to_wait(&coredev->buffer_mng_waitq, &wait, TASK_INTERRUPTIBLE); - if (list_empty(&coredev->buffers)) { - spin_unlock_irqrestore(&coredev->bufferslock, flags); - schedule(); - spin_lock_irqsave(&coredev->bufferslock, flags); + if (!list_empty(&coredev->buffers)) { + cb = (struct smscore_buffer_t *) coredev->buffers.next; + list_del(&cb->entry); } + spin_unlock_irqrestore(&coredev->bufferslock, flags); + return cb; +} - finish_wait(&coredev->buffer_mng_waitq, &wait); - - cb = (struct smscore_buffer_t *) coredev->buffers.next; - list_del(&cb->entry); +struct smscore_buffer_t *smscore_getbuffer(struct smscore_device_t *coredev) +{ + struct smscore_buffer_t *cb = NULL; - spin_unlock_irqrestore(&coredev->bufferslock, flags); + wait_event(coredev->buffer_mng_waitq, (cb = get_entry(coredev))); return cb; } diff --git a/drivers/media/radio/si470x/radio-si470x-i2c.c b/drivers/media/radio/si470x/radio-si470x-i2c.c index 67a4ec8768a6..4ce541a5eb47 100644 --- a/drivers/media/radio/si470x/radio-si470x-i2c.c +++ b/drivers/media/radio/si470x/radio-si470x-i2c.c @@ -395,7 +395,7 @@ static int __devinit si470x_i2c_probe(struct i2c_client *client, radio->registers[POWERCFG] = POWERCFG_ENABLE; if (si470x_set_register(radio, POWERCFG) < 0) { retval = -EIO; - goto err_all; + goto err_video; } msleep(110); diff --git a/drivers/media/video/cx231xx/Makefile b/drivers/media/video/cx231xx/Makefile index 755dd0ce65ff..6f2b57384488 100644 --- a/drivers/media/video/cx231xx/Makefile +++ b/drivers/media/video/cx231xx/Makefile @@ -11,4 +11,5 @@ EXTRA_CFLAGS += -Idrivers/media/video EXTRA_CFLAGS += -Idrivers/media/common/tuners EXTRA_CFLAGS += -Idrivers/media/dvb/dvb-core EXTRA_CFLAGS += -Idrivers/media/dvb/frontends +EXTRA_CFLAGS += -Idrivers/media/dvb/dvb-usb diff --git a/drivers/media/video/cx231xx/cx231xx-cards.c b/drivers/media/video/cx231xx/cx231xx-cards.c index 6bdc0ef18119..f2a4900014bc 100644 --- a/drivers/media/video/cx231xx/cx231xx-cards.c +++ b/drivers/media/video/cx231xx/cx231xx-cards.c @@ -32,6 +32,7 @@ #include <media/v4l2-chip-ident.h> #include <media/cx25840.h> +#include "dvb-usb-ids.h" #include "xc5000.h" #include "cx231xx.h" @@ -175,6 +176,8 @@ struct usb_device_id cx231xx_id_table[] = { .driver_info = CX231XX_BOARD_CNXT_RDE_250}, {USB_DEVICE(0x0572, 0x58A1), .driver_info = CX231XX_BOARD_CNXT_RDU_250}, + {USB_DEVICE_VER(USB_VID_PIXELVIEW, USB_PID_PIXELVIEW_SBTVD, 0x4000,0x4fff), + .driver_info = CX231XX_BOARD_UNKNOWN}, {}, }; @@ -226,14 +229,16 @@ void cx231xx_pre_card_setup(struct cx231xx *dev) dev->board.name, dev->model); /* set the direction for GPIO pins */ - cx231xx_set_gpio_direction(dev, dev->board.tuner_gpio->bit, 1); - cx231xx_set_gpio_value(dev, dev->board.tuner_gpio->bit, 1); - cx231xx_set_gpio_direction(dev, dev->board.tuner_sif_gpio, 1); + if (dev->board.tuner_gpio) { + cx231xx_set_gpio_direction(dev, dev->board.tuner_gpio->bit, 1); + cx231xx_set_gpio_value(dev, dev->board.tuner_gpio->bit, 1); + cx231xx_set_gpio_direction(dev, dev->board.tuner_sif_gpio, 1); - /* request some modules if any required */ + /* request some modules if any required */ - /* reset the Tuner */ - cx231xx_gpio_set(dev, dev->board.tuner_gpio); + /* reset the Tuner */ + cx231xx_gpio_set(dev, dev->board.tuner_gpio); + } /* set the mode to Analog mode initially */ cx231xx_set_mode(dev, CX231XX_ANALOG_MODE); diff --git a/drivers/media/video/cx25840/cx25840-core.c b/drivers/media/video/cx25840/cx25840-core.c index 86ca8c2359dd..f5a3e74c3c7c 100644 --- a/drivers/media/video/cx25840/cx25840-core.c +++ b/drivers/media/video/cx25840/cx25840-core.c @@ -1996,7 +1996,7 @@ static int cx25840_probe(struct i2c_client *client, state->volume = v4l2_ctrl_new_std(&state->hdl, &cx25840_audio_ctrl_ops, V4L2_CID_AUDIO_VOLUME, - 0, 65335, 65535 / 100, default_volume); + 0, 65535, 65535 / 100, default_volume); state->mute = v4l2_ctrl_new_std(&state->hdl, &cx25840_audio_ctrl_ops, V4L2_CID_AUDIO_MUTE, 0, 1, 1, 0); diff --git a/drivers/media/video/cx88/Kconfig b/drivers/media/video/cx88/Kconfig index 99dbae117591..0fa85cbefbb1 100644 --- a/drivers/media/video/cx88/Kconfig +++ b/drivers/media/video/cx88/Kconfig @@ -17,7 +17,7 @@ config VIDEO_CX88 config VIDEO_CX88_ALSA tristate "Conexant 2388x DMA audio support" - depends on VIDEO_CX88 && SND && EXPERIMENTAL + depends on VIDEO_CX88 && SND select SND_PCM ---help--- This is a video4linux driver for direct (DMA) audio on diff --git a/drivers/media/video/gspca/gspca.c b/drivers/media/video/gspca/gspca.c index b9846106913e..78abc1c1f9d5 100644 --- a/drivers/media/video/gspca/gspca.c +++ b/drivers/media/video/gspca/gspca.c @@ -223,6 +223,7 @@ static int alloc_and_submit_int_urb(struct gspca_dev *gspca_dev, usb_rcvintpipe(dev, ep->bEndpointAddress), buffer, buffer_len, int_irq, (void *)gspca_dev, interval); + urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; gspca_dev->int_urb = urb; ret = usb_submit_urb(urb, GFP_KERNEL); if (ret < 0) { diff --git a/drivers/media/video/gspca/sn9c20x.c b/drivers/media/video/gspca/sn9c20x.c index 83a718f0f3f9..9052d5702556 100644 --- a/drivers/media/video/gspca/sn9c20x.c +++ b/drivers/media/video/gspca/sn9c20x.c @@ -2357,8 +2357,7 @@ static void sd_pkt_scan(struct gspca_dev *gspca_dev, (data[33] << 10); avg_lum >>= 9; atomic_set(&sd->avg_lum, avg_lum); - gspca_frame_add(gspca_dev, LAST_PACKET, - data, len); + gspca_frame_add(gspca_dev, LAST_PACKET, NULL, 0); return; } if (gspca_dev->last_packet_type == LAST_PACKET) { diff --git a/drivers/media/video/ivtv/ivtvfb.c b/drivers/media/video/ivtv/ivtvfb.c index be03a712731c..f0316d02f09f 100644 --- a/drivers/media/video/ivtv/ivtvfb.c +++ b/drivers/media/video/ivtv/ivtvfb.c @@ -466,6 +466,8 @@ static int ivtvfb_ioctl(struct fb_info *info, unsigned int cmd, unsigned long ar struct fb_vblank vblank; u32 trace; + memset(&vblank, 0, sizeof(struct fb_vblank)); + vblank.flags = FB_VBLANK_HAVE_COUNT |FB_VBLANK_HAVE_VCOUNT | FB_VBLANK_HAVE_VSYNC; trace = read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16; diff --git a/drivers/media/video/mem2mem_testdev.c b/drivers/media/video/mem2mem_testdev.c index 4525335f9bd4..a7210d981388 100644 --- a/drivers/media/video/mem2mem_testdev.c +++ b/drivers/media/video/mem2mem_testdev.c @@ -239,7 +239,7 @@ static int device_process(struct m2mtest_ctx *ctx, return -EFAULT; } - if (in_buf->vb.size < out_buf->vb.size) { + if (in_buf->vb.size > out_buf->vb.size) { v4l2_err(&dev->v4l2_dev, "Output buffer is too small\n"); return -EINVAL; } @@ -1014,6 +1014,7 @@ static int m2mtest_remove(struct platform_device *pdev) v4l2_m2m_release(dev->m2m_dev); del_timer_sync(&dev->timer); video_unregister_device(dev->vfd); + video_device_release(dev->vfd); v4l2_device_unregister(&dev->v4l2_dev); kfree(dev); diff --git a/drivers/media/video/mt9m111.c b/drivers/media/video/mt9m111.c index 758a4db27d65..c71af4e0e517 100644 --- a/drivers/media/video/mt9m111.c +++ b/drivers/media/video/mt9m111.c @@ -447,6 +447,9 @@ static int mt9m111_s_crop(struct v4l2_subdev *sd, struct v4l2_crop *a) dev_dbg(&client->dev, "%s left=%d, top=%d, width=%d, height=%d\n", __func__, rect.left, rect.top, rect.width, rect.height); + if (a->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) + return -EINVAL; + ret = mt9m111_make_rect(client, &rect); if (!ret) mt9m111->rect = rect; @@ -466,12 +469,14 @@ static int mt9m111_g_crop(struct v4l2_subdev *sd, struct v4l2_crop *a) static int mt9m111_cropcap(struct v4l2_subdev *sd, struct v4l2_cropcap *a) { + if (a->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) + return -EINVAL; + a->bounds.left = MT9M111_MIN_DARK_COLS; a->bounds.top = MT9M111_MIN_DARK_ROWS; a->bounds.width = MT9M111_MAX_WIDTH; a->bounds.height = MT9M111_MAX_HEIGHT; a->defrect = a->bounds; - a->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; a->pixelaspect.numerator = 1; a->pixelaspect.denominator = 1; @@ -487,6 +492,7 @@ static int mt9m111_g_fmt(struct v4l2_subdev *sd, mf->width = mt9m111->rect.width; mf->height = mt9m111->rect.height; mf->code = mt9m111->fmt->code; + mf->colorspace = mt9m111->fmt->colorspace; mf->field = V4L2_FIELD_NONE; return 0; diff --git a/drivers/media/video/mt9v022.c b/drivers/media/video/mt9v022.c index e7cd23cd6394..b48473c7896b 100644 --- a/drivers/media/video/mt9v022.c +++ b/drivers/media/video/mt9v022.c @@ -402,9 +402,6 @@ static int mt9v022_s_fmt(struct v4l2_subdev *sd, if (mt9v022->model != V4L2_IDENT_MT9V022IX7ATC) return -EINVAL; break; - case 0: - /* No format change, only geometry */ - break; default: return -EINVAL; } diff --git a/drivers/media/video/mx2_camera.c b/drivers/media/video/mx2_camera.c index 66ff174151b5..b6ea67221d1d 100644 --- a/drivers/media/video/mx2_camera.c +++ b/drivers/media/video/mx2_camera.c @@ -378,6 +378,9 @@ static void mx25_camera_frame_done(struct mx2_camera_dev *pcdev, int fb, spin_lock_irqsave(&pcdev->lock, flags); + if (*fb_active == NULL) + goto out; + vb = &(*fb_active)->vb; dev_dbg(pcdev->dev, "%s (vb=0x%p) 0x%08lx %d\n", __func__, vb, vb->baddr, vb->bsize); @@ -402,6 +405,7 @@ static void mx25_camera_frame_done(struct mx2_camera_dev *pcdev, int fb, *fb_active = buf; +out: spin_unlock_irqrestore(&pcdev->lock, flags); } diff --git a/drivers/media/video/pvrusb2/pvrusb2-ctrl.c b/drivers/media/video/pvrusb2/pvrusb2-ctrl.c index 1b992b847198..55ea914c7fcd 100644 --- a/drivers/media/video/pvrusb2/pvrusb2-ctrl.c +++ b/drivers/media/video/pvrusb2/pvrusb2-ctrl.c @@ -513,7 +513,7 @@ int pvr2_ctrl_sym_to_value(struct pvr2_ctrl *cptr, if (ret >= 0) { ret = pvr2_ctrl_range_check(cptr,*valptr); } - if (maskptr) *maskptr = ~0; + *maskptr = ~0; } else if (cptr->info->type == pvr2_ctl_bool) { ret = parse_token(ptr,len,valptr,boolNames, ARRAY_SIZE(boolNames)); @@ -522,7 +522,7 @@ int pvr2_ctrl_sym_to_value(struct pvr2_ctrl *cptr, } else if (ret == 0) { *valptr = (*valptr & 1) ? !0 : 0; } - if (maskptr) *maskptr = 1; + *maskptr = 1; } else if (cptr->info->type == pvr2_ctl_enum) { ret = parse_token( ptr,len,valptr, @@ -531,7 +531,7 @@ int pvr2_ctrl_sym_to_value(struct pvr2_ctrl *cptr, if (ret >= 0) { ret = pvr2_ctrl_range_check(cptr,*valptr); } - if (maskptr) *maskptr = ~0; + *maskptr = ~0; } else if (cptr->info->type == pvr2_ctl_bitmask) { ret = parse_tlist( ptr,len,maskptr,valptr, diff --git a/drivers/media/video/s5p-fimc/fimc-core.c b/drivers/media/video/s5p-fimc/fimc-core.c index b151c7be8a50..6961c55baf9b 100644 --- a/drivers/media/video/s5p-fimc/fimc-core.c +++ b/drivers/media/video/s5p-fimc/fimc-core.c @@ -393,6 +393,37 @@ static void fimc_set_yuv_order(struct fimc_ctx *ctx) dbg("ctx->out_order_1p= %d", ctx->out_order_1p); } +static void fimc_prepare_dma_offset(struct fimc_ctx *ctx, struct fimc_frame *f) +{ + struct samsung_fimc_variant *variant = ctx->fimc_dev->variant; + + f->dma_offset.y_h = f->offs_h; + if (!variant->pix_hoff) + f->dma_offset.y_h *= (f->fmt->depth >> 3); + + f->dma_offset.y_v = f->offs_v; + + f->dma_offset.cb_h = f->offs_h; + f->dma_offset.cb_v = f->offs_v; + + f->dma_offset.cr_h = f->offs_h; + f->dma_offset.cr_v = f->offs_v; + + if (!variant->pix_hoff) { + if (f->fmt->planes_cnt == 3) { + f->dma_offset.cb_h >>= 1; + f->dma_offset.cr_h >>= 1; + } + if (f->fmt->color == S5P_FIMC_YCBCR420) { + f->dma_offset.cb_v >>= 1; + f->dma_offset.cr_v >>= 1; + } + } + + dbg("in_offset: color= %d, y_h= %d, y_v= %d", + f->fmt->color, f->dma_offset.y_h, f->dma_offset.y_v); +} + /** * fimc_prepare_config - check dimensions, operation and color mode * and pre-calculate offset and the scaling coefficients. @@ -406,7 +437,6 @@ static int fimc_prepare_config(struct fimc_ctx *ctx, u32 flags) { struct fimc_frame *s_frame, *d_frame; struct fimc_vid_buffer *buf = NULL; - struct samsung_fimc_variant *variant = ctx->fimc_dev->variant; int ret = 0; s_frame = &ctx->s_frame; @@ -419,61 +449,16 @@ static int fimc_prepare_config(struct fimc_ctx *ctx, u32 flags) swap(d_frame->width, d_frame->height); } - /* Prepare the output offset ratios for scaler. */ - d_frame->dma_offset.y_h = d_frame->offs_h; - if (!variant->pix_hoff) - d_frame->dma_offset.y_h *= (d_frame->fmt->depth >> 3); - - d_frame->dma_offset.y_v = d_frame->offs_v; - - d_frame->dma_offset.cb_h = d_frame->offs_h; - d_frame->dma_offset.cb_v = d_frame->offs_v; - - d_frame->dma_offset.cr_h = d_frame->offs_h; - d_frame->dma_offset.cr_v = d_frame->offs_v; + /* Prepare the DMA offset ratios for scaler. */ + fimc_prepare_dma_offset(ctx, &ctx->s_frame); + fimc_prepare_dma_offset(ctx, &ctx->d_frame); - if (!variant->pix_hoff && d_frame->fmt->planes_cnt == 3) { - d_frame->dma_offset.cb_h >>= 1; - d_frame->dma_offset.cb_v >>= 1; - d_frame->dma_offset.cr_h >>= 1; - d_frame->dma_offset.cr_v >>= 1; - } - - dbg("out offset: color= %d, y_h= %d, y_v= %d", - d_frame->fmt->color, - d_frame->dma_offset.y_h, d_frame->dma_offset.y_v); - - /* Prepare the input offset ratios for scaler. */ - s_frame->dma_offset.y_h = s_frame->offs_h; - if (!variant->pix_hoff) - s_frame->dma_offset.y_h *= (s_frame->fmt->depth >> 3); - s_frame->dma_offset.y_v = s_frame->offs_v; - - s_frame->dma_offset.cb_h = s_frame->offs_h; - s_frame->dma_offset.cb_v = s_frame->offs_v; - - s_frame->dma_offset.cr_h = s_frame->offs_h; - s_frame->dma_offset.cr_v = s_frame->offs_v; - - if (!variant->pix_hoff && s_frame->fmt->planes_cnt == 3) { - s_frame->dma_offset.cb_h >>= 1; - s_frame->dma_offset.cb_v >>= 1; - s_frame->dma_offset.cr_h >>= 1; - s_frame->dma_offset.cr_v >>= 1; - } - - dbg("in offset: color= %d, y_h= %d, y_v= %d", - s_frame->fmt->color, s_frame->dma_offset.y_h, - s_frame->dma_offset.y_v); - - fimc_set_yuv_order(ctx); - - /* Check against the scaler ratio. */ if (s_frame->height > (SCALER_MAX_VRATIO * d_frame->height) || s_frame->width > (SCALER_MAX_HRATIO * d_frame->width)) { err("out of scaler range"); return -EINVAL; } + fimc_set_yuv_order(ctx); } /* Input DMA mode is not allowed when the scaler is disabled. */ @@ -822,7 +807,8 @@ static int fimc_m2m_s_fmt(struct file *file, void *priv, struct v4l2_format *f) } else { v4l2_err(&ctx->fimc_dev->m2m.v4l2_dev, "Wrong buffer/video queue type (%d)\n", f->type); - return -EINVAL; + ret = -EINVAL; + goto s_fmt_out; } pix = &f->fmt.pix; @@ -1414,8 +1400,10 @@ static int fimc_probe(struct platform_device *pdev) } fimc->work_queue = create_workqueue(dev_name(&fimc->pdev->dev)); - if (!fimc->work_queue) + if (!fimc->work_queue) { + ret = -ENOMEM; goto err_irq; + } ret = fimc_register_m2m_device(fimc); if (ret) @@ -1492,6 +1480,7 @@ static struct samsung_fimc_variant fimc2_variant_s5p = { }; static struct samsung_fimc_variant fimc01_variant_s5pv210 = { + .pix_hoff = 1, .has_inp_rot = 1, .has_out_rot = 1, .min_inp_pixsize = 16, @@ -1506,6 +1495,7 @@ static struct samsung_fimc_variant fimc01_variant_s5pv210 = { }; static struct samsung_fimc_variant fimc2_variant_s5pv210 = { + .pix_hoff = 1, .min_inp_pixsize = 16, .min_out_pixsize = 32, diff --git a/drivers/media/video/saa7134/saa7134-cards.c b/drivers/media/video/saa7134/saa7134-cards.c index ec697fcd406e..bb8d83d8ddaf 100644 --- a/drivers/media/video/saa7134/saa7134-cards.c +++ b/drivers/media/video/saa7134/saa7134-cards.c @@ -4323,13 +4323,13 @@ struct saa7134_board saa7134_boards[] = { }, [SAA7134_BOARD_BEHOLD_COLUMBUS_TVFM] = { /* Beholder Intl. Ltd. 2008 */ - /*Dmitry Belimov <d.belimov@gmail.com> */ - .name = "Beholder BeholdTV Columbus TVFM", + /* Dmitry Belimov <d.belimov@gmail.com> */ + .name = "Beholder BeholdTV Columbus TV/FM", .audio_clock = 0x00187de7, .tuner_type = TUNER_ALPS_TSBE5_PAL, - .radio_type = UNSET, - .tuner_addr = ADDR_UNSET, - .radio_addr = ADDR_UNSET, + .radio_type = TUNER_TEA5767, + .tuner_addr = 0xc2 >> 1, + .radio_addr = 0xc0 >> 1, .tda9887_conf = TDA9887_PRESENT, .gpiomask = 0x000A8004, .inputs = {{ diff --git a/drivers/media/video/saa7164/saa7164-buffer.c b/drivers/media/video/saa7164/saa7164-buffer.c index 5713f3a4b76c..ddd25d32723d 100644 --- a/drivers/media/video/saa7164/saa7164-buffer.c +++ b/drivers/media/video/saa7164/saa7164-buffer.c @@ -136,10 +136,11 @@ ret: int saa7164_buffer_dealloc(struct saa7164_tsport *port, struct saa7164_buffer *buf) { - struct saa7164_dev *dev = port->dev; + struct saa7164_dev *dev; - if ((buf == 0) || (port == 0)) + if (!buf || !port) return SAA_ERR_BAD_PARAMETER; + dev = port->dev; dprintk(DBGLVL_BUF, "%s() deallocating buffer @ 0x%p\n", __func__, buf); diff --git a/drivers/media/video/uvc/uvc_driver.c b/drivers/media/video/uvc/uvc_driver.c index 8bdd940f32e6..2ac85d8984f0 100644 --- a/drivers/media/video/uvc/uvc_driver.c +++ b/drivers/media/video/uvc/uvc_driver.c @@ -486,6 +486,12 @@ static int uvc_parse_format(struct uvc_device *dev, max(frame->dwFrameInterval[0], frame->dwDefaultFrameInterval)); + if (dev->quirks & UVC_QUIRK_RESTRICT_FRAME_RATE) { + frame->bFrameIntervalType = 1; + frame->dwFrameInterval[0] = + frame->dwDefaultFrameInterval; + } + uvc_trace(UVC_TRACE_DESCR, "- %ux%u (%u.%u fps)\n", frame->wWidth, frame->wHeight, 10000000/frame->dwDefaultFrameInterval, @@ -2026,6 +2032,15 @@ static struct usb_device_id uvc_ids[] = { .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, + /* Chicony CNF7129 (Asus EEE 100HE) */ + { .match_flags = USB_DEVICE_ID_MATCH_DEVICE + | USB_DEVICE_ID_MATCH_INT_INFO, + .idVendor = 0x04f2, + .idProduct = 0xb071, + .bInterfaceClass = USB_CLASS_VIDEO, + .bInterfaceSubClass = 1, + .bInterfaceProtocol = 0, + .driver_info = UVC_QUIRK_RESTRICT_FRAME_RATE }, /* Alcor Micro AU3820 (Future Boy PC USB Webcam) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, @@ -2091,6 +2106,15 @@ static struct usb_device_id uvc_ids[] = { .bInterfaceProtocol = 0, .driver_info = UVC_QUIRK_PROBE_MINMAX | UVC_QUIRK_PROBE_DEF }, + /* IMC Networks (Medion Akoya) */ + { .match_flags = USB_DEVICE_ID_MATCH_DEVICE + | USB_DEVICE_ID_MATCH_INT_INFO, + .idVendor = 0x13d3, + .idProduct = 0x5103, + .bInterfaceClass = USB_CLASS_VIDEO, + .bInterfaceSubClass = 1, + .bInterfaceProtocol = 0, + .driver_info = UVC_QUIRK_STREAM_NO_FID }, /* Syntek (HP Spartan) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, diff --git a/drivers/media/video/uvc/uvcvideo.h b/drivers/media/video/uvc/uvcvideo.h index bdacf3beabf5..892e0e51916c 100644 --- a/drivers/media/video/uvc/uvcvideo.h +++ b/drivers/media/video/uvc/uvcvideo.h @@ -182,6 +182,7 @@ struct uvc_xu_control { #define UVC_QUIRK_IGNORE_SELECTOR_UNIT 0x00000020 #define UVC_QUIRK_FIX_BANDWIDTH 0x00000080 #define UVC_QUIRK_PROBE_DEF 0x00000100 +#define UVC_QUIRK_RESTRICT_FRAME_RATE 0x00000200 /* Format flags */ #define UVC_FMT_FLAG_COMPRESSED 0x00000001 diff --git a/drivers/media/video/v4l2-compat-ioctl32.c b/drivers/media/video/v4l2-compat-ioctl32.c index 073f01390cdd..86294ed35c9b 100644 --- a/drivers/media/video/v4l2-compat-ioctl32.c +++ b/drivers/media/video/v4l2-compat-ioctl32.c @@ -193,17 +193,24 @@ static int put_video_window32(struct video_window *kp, struct video_window32 __u struct video_code32 { char loadwhat[16]; /* name or tag of file being passed */ compat_int_t datasize; - unsigned char *data; + compat_uptr_t data; }; -static int get_microcode32(struct video_code *kp, struct video_code32 __user *up) +static struct video_code __user *get_microcode32(struct video_code32 *kp) { - if (!access_ok(VERIFY_READ, up, sizeof(struct video_code32)) || - copy_from_user(kp->loadwhat, up->loadwhat, sizeof(up->loadwhat)) || - get_user(kp->datasize, &up->datasize) || - copy_from_user(kp->data, up->data, up->datasize)) - return -EFAULT; - return 0; + struct video_code __user *up; + + up = compat_alloc_user_space(sizeof(*up)); + + /* + * NOTE! We don't actually care if these fail. If the + * user address is invalid, the native ioctl will do + * the error handling for us + */ + (void) copy_to_user(up->loadwhat, kp->loadwhat, sizeof(up->loadwhat)); + (void) put_user(kp->datasize, &up->datasize); + (void) put_user(compat_ptr(kp->data), &up->data); + return up; } #define VIDIOCGTUNER32 _IOWR('v', 4, struct video_tuner32) @@ -739,7 +746,7 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar struct video_tuner vt; struct video_buffer vb; struct video_window vw; - struct video_code vc; + struct video_code32 vc; struct video_audio va; #endif struct v4l2_format v2f; @@ -818,8 +825,11 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar break; case VIDIOCSMICROCODE: - err = get_microcode32(&karg.vc, up); - compatible_arg = 0; + /* Copy the 32-bit "video_code32" to kernel space */ + if (copy_from_user(&karg.vc, up, sizeof(karg.vc))) + return -EFAULT; + /* Convert the 32-bit version to a 64-bit version in user space */ + up = get_microcode32(&karg.vc); break; case VIDIOCSFREQ: diff --git a/drivers/media/video/videobuf-dma-contig.c b/drivers/media/video/videobuf-dma-contig.c index 372b87efcd05..6ff9e4bac3ea 100644 --- a/drivers/media/video/videobuf-dma-contig.c +++ b/drivers/media/video/videobuf-dma-contig.c @@ -393,8 +393,10 @@ void videobuf_dma_contig_free(struct videobuf_queue *q, } /* read() method */ - dma_free_coherent(q->dev, mem->size, mem->vaddr, mem->dma_handle); - mem->vaddr = NULL; + if (mem->vaddr) { + dma_free_coherent(q->dev, mem->size, mem->vaddr, mem->dma_handle); + mem->vaddr = NULL; + } } EXPORT_SYMBOL_GPL(videobuf_dma_contig_free); diff --git a/drivers/media/video/videobuf-dma-sg.c b/drivers/media/video/videobuf-dma-sg.c index 06f9a9c2a39a..2ad0bc252b0e 100644 --- a/drivers/media/video/videobuf-dma-sg.c +++ b/drivers/media/video/videobuf-dma-sg.c @@ -94,7 +94,7 @@ err: * must free the memory. */ static struct scatterlist *videobuf_pages_to_sg(struct page **pages, - int nr_pages, int offset) + int nr_pages, int offset, size_t size) { struct scatterlist *sglist; int i; @@ -110,12 +110,14 @@ static struct scatterlist *videobuf_pages_to_sg(struct page **pages, /* DMA to highmem pages might not work */ goto highmem; sg_set_page(&sglist[0], pages[0], PAGE_SIZE - offset, offset); + size -= PAGE_SIZE - offset; for (i = 1; i < nr_pages; i++) { if (NULL == pages[i]) goto nopage; if (PageHighMem(pages[i])) goto highmem; - sg_set_page(&sglist[i], pages[i], PAGE_SIZE, 0); + sg_set_page(&sglist[i], pages[i], min(PAGE_SIZE, size), 0); + size -= min(PAGE_SIZE, size); } return sglist; @@ -170,7 +172,8 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, first = (data & PAGE_MASK) >> PAGE_SHIFT; last = ((data+size-1) & PAGE_MASK) >> PAGE_SHIFT; - dma->offset = data & ~PAGE_MASK; + dma->offset = data & ~PAGE_MASK; + dma->size = size; dma->nr_pages = last-first+1; dma->pages = kmalloc(dma->nr_pages * sizeof(struct page *), GFP_KERNEL); if (NULL == dma->pages) @@ -252,7 +255,7 @@ int videobuf_dma_map(struct device *dev, struct videobuf_dmabuf *dma) if (dma->pages) { dma->sglist = videobuf_pages_to_sg(dma->pages, dma->nr_pages, - dma->offset); + dma->offset, dma->size); } if (dma->vaddr) { dma->sglist = videobuf_vmalloc_to_sg(dma->vaddr, diff --git a/drivers/misc/bh1780gli.c b/drivers/misc/bh1780gli.c index 714c6b487313..d5f3a3fd2319 100644 --- a/drivers/misc/bh1780gli.c +++ b/drivers/misc/bh1780gli.c @@ -190,7 +190,6 @@ static int __devexit bh1780_remove(struct i2c_client *client) ddata = i2c_get_clientdata(client); sysfs_remove_group(&client->dev.kobj, &bh1780_attr_group); - i2c_set_clientdata(client, NULL); kfree(ddata); return 0; diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 5db49b124ffa..09eee6df0653 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -1631,6 +1631,19 @@ int mmc_suspend_host(struct mmc_host *host) if (host->bus_ops && !host->bus_dead) { if (host->bus_ops->suspend) err = host->bus_ops->suspend(host); + if (err == -ENOSYS || !host->bus_ops->resume) { + /* + * We simply "remove" the card in this case. + * It will be redetected on resume. + */ + if (host->bus_ops->remove) + host->bus_ops->remove(host); + mmc_claim_host(host); + mmc_detach_bus(host); + mmc_release_host(host); + host->pm_flags = 0; + err = 0; + } } mmc_bus_put(host); diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c index b2828e84d243..214b03afdd48 100644 --- a/drivers/mtd/nand/mxc_nand.c +++ b/drivers/mtd/nand/mxc_nand.c @@ -30,6 +30,8 @@ #include <linux/clk.h> #include <linux/err.h> #include <linux/io.h> +#include <linux/irq.h> +#include <linux/completion.h> #include <asm/mach/flash.h> #include <mach/mxc_nand.h> @@ -151,7 +153,7 @@ struct mxc_nand_host { int irq; int eccsize; - wait_queue_head_t irq_waitq; + struct completion op_completion; uint8_t *data_buf; unsigned int buf_start; @@ -164,6 +166,7 @@ struct mxc_nand_host { void (*send_read_id)(struct mxc_nand_host *); uint16_t (*get_dev_status)(struct mxc_nand_host *); int (*check_int)(struct mxc_nand_host *); + void (*irq_control)(struct mxc_nand_host *, int); }; /* OOB placement block for use with hardware ecc generation */ @@ -216,9 +219,12 @@ static irqreturn_t mxc_nfc_irq(int irq, void *dev_id) { struct mxc_nand_host *host = dev_id; - disable_irq_nosync(irq); + if (!host->check_int(host)) + return IRQ_NONE; - wake_up(&host->irq_waitq); + host->irq_control(host, 0); + + complete(&host->op_completion); return IRQ_HANDLED; } @@ -245,11 +251,54 @@ static int check_int_v1_v2(struct mxc_nand_host *host) if (!(tmp & NFC_V1_V2_CONFIG2_INT)) return 0; - writew(tmp & ~NFC_V1_V2_CONFIG2_INT, NFC_V1_V2_CONFIG2); + if (!cpu_is_mx21()) + writew(tmp & ~NFC_V1_V2_CONFIG2_INT, NFC_V1_V2_CONFIG2); return 1; } +/* + * It has been observed that the i.MX21 cannot read the CONFIG2:INT bit + * if interrupts are masked (CONFIG1:INT_MSK is set). To handle this, the + * driver can enable/disable the irq line rather than simply masking the + * interrupts. + */ +static void irq_control_mx21(struct mxc_nand_host *host, int activate) +{ + if (activate) + enable_irq(host->irq); + else + disable_irq_nosync(host->irq); +} + +static void irq_control_v1_v2(struct mxc_nand_host *host, int activate) +{ + uint16_t tmp; + + tmp = readw(NFC_V1_V2_CONFIG1); + + if (activate) + tmp &= ~NFC_V1_V2_CONFIG1_INT_MSK; + else + tmp |= NFC_V1_V2_CONFIG1_INT_MSK; + + writew(tmp, NFC_V1_V2_CONFIG1); +} + +static void irq_control_v3(struct mxc_nand_host *host, int activate) +{ + uint32_t tmp; + + tmp = readl(NFC_V3_CONFIG2); + + if (activate) + tmp &= ~NFC_V3_CONFIG2_INT_MSK; + else + tmp |= NFC_V3_CONFIG2_INT_MSK; + + writel(tmp, NFC_V3_CONFIG2); +} + /* This function polls the NANDFC to wait for the basic operation to * complete by checking the INT bit of config2 register. */ @@ -259,10 +308,9 @@ static void wait_op_done(struct mxc_nand_host *host, int useirq) if (useirq) { if (!host->check_int(host)) { - - enable_irq(host->irq); - - wait_event(host->irq_waitq, host->check_int(host)); + INIT_COMPLETION(host->op_completion); + host->irq_control(host, 1); + wait_for_completion(&host->op_completion); } } else { while (max_retries-- > 0) { @@ -799,6 +847,7 @@ static void preset_v3(struct mtd_info *mtd) NFC_V3_CONFIG2_2CMD_PHASES | NFC_V3_CONFIG2_SPAS(mtd->oobsize >> 1) | NFC_V3_CONFIG2_ST_CMD(0x70) | + NFC_V3_CONFIG2_INT_MSK | NFC_V3_CONFIG2_NUM_ADDR_PHASE0; if (chip->ecc.mode == NAND_ECC_HW) @@ -1024,6 +1073,10 @@ static int __init mxcnd_probe(struct platform_device *pdev) host->send_read_id = send_read_id_v1_v2; host->get_dev_status = get_dev_status_v1_v2; host->check_int = check_int_v1_v2; + if (cpu_is_mx21()) + host->irq_control = irq_control_mx21; + else + host->irq_control = irq_control_v1_v2; } if (nfc_is_v21()) { @@ -1062,6 +1115,7 @@ static int __init mxcnd_probe(struct platform_device *pdev) host->send_read_id = send_read_id_v3; host->check_int = check_int_v3; host->get_dev_status = get_dev_status_v3; + host->irq_control = irq_control_v3; oob_smallpage = &nandv2_hw_eccoob_smallpage; oob_largepage = &nandv2_hw_eccoob_largepage; } else @@ -1093,14 +1147,34 @@ static int __init mxcnd_probe(struct platform_device *pdev) this->options |= NAND_USE_FLASH_BBT; } - init_waitqueue_head(&host->irq_waitq); + init_completion(&host->op_completion); host->irq = platform_get_irq(pdev, 0); + /* + * mask the interrupt. For i.MX21 explicitely call + * irq_control_v1_v2 to use the mask bit. We can't call + * disable_irq_nosync() for an interrupt we do not own yet. + */ + if (cpu_is_mx21()) + irq_control_v1_v2(host, 0); + else + host->irq_control(host, 0); + err = request_irq(host->irq, mxc_nfc_irq, IRQF_DISABLED, DRIVER_NAME, host); if (err) goto eirq; + host->irq_control(host, 0); + + /* + * Now that the interrupt is disabled make sure the interrupt + * mask bit is cleared on i.MX21. Otherwise we can't read + * the interrupt status bit on this machine. + */ + if (cpu_is_mx21()) + irq_control_v1_v2(host, 1); + /* first scan to find the device and get the page size */ if (nand_scan_ident(mtd, 1, NULL)) { err = -ENXIO; diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c index 70705d1306b9..eca55c52bdfd 100644 --- a/drivers/net/3c527.c +++ b/drivers/net/3c527.c @@ -522,7 +522,7 @@ static int __init mc32_probe1(struct net_device *dev, int slot) lp->tx_len = lp->exec_box->data[9]; /* Transmit list count */ lp->rx_len = lp->exec_box->data[11]; /* Receive list count */ - init_MUTEX_LOCKED(&lp->cmd_mutex); + sema_init(&lp->cmd_mutex, 0); init_completion(&lp->execution_cmd); init_completion(&lp->xceiver_cmd); diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 2cc81a54cbf3..5db667c0b371 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -2428,7 +2428,7 @@ config UGETH_TX_ON_DEMAND config MV643XX_ETH tristate "Marvell Discovery (643XX) and Orion ethernet support" - depends on MV64X60 || PPC32 || PLAT_ORION + depends on (MV64X60 || PPC32 || PLAT_ORION) && INET select INET_LRO select PHYLIB help @@ -2803,7 +2803,7 @@ config NIU config PASEMI_MAC tristate "PA Semi 1/10Gbit MAC" - depends on PPC_PASEMI && PCI + depends on PPC_PASEMI && PCI && INET select PHYLIB select INET_LRO help diff --git a/drivers/net/b44.c b/drivers/net/b44.c index 1e620e287ae0..efeffdf9e5fa 100644 --- a/drivers/net/b44.c +++ b/drivers/net/b44.c @@ -2170,8 +2170,6 @@ static int __devinit b44_init_one(struct ssb_device *sdev, dev->irq = sdev->irq; SET_ETHTOOL_OPS(dev, &b44_ethtool_ops); - netif_carrier_off(dev); - err = ssb_bus_powerup(sdev->bus, 0); if (err) { dev_err(sdev->dev, @@ -2213,6 +2211,8 @@ static int __devinit b44_init_one(struct ssb_device *sdev, goto err_out_powerdown; } + netif_carrier_off(dev); + ssb_set_drvdata(sdev, dev); /* Chip reset provides power to the b44 MAC & PCI cores, which diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 3b16f62d5606..e953c6ad6e6d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -5164,6 +5164,15 @@ int bond_create(struct net *net, const char *name) res = dev_alloc_name(bond_dev, "bond%d"); if (res < 0) goto out; + } else { + /* + * If we're given a name to register + * we need to ensure that its not already + * registered + */ + res = -EEXIST; + if (__dev_get_by_name(net, name) != NULL) + goto out; } res = register_netdevice(bond_dev); diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c index a333b42111b8..6372610ed240 100644 --- a/drivers/net/ehea/ehea_main.c +++ b/drivers/net/ehea/ehea_main.c @@ -533,8 +533,15 @@ static inline void ehea_fill_skb(struct net_device *dev, int length = cqe->num_bytes_transfered - 4; /*remove CRC */ skb_put(skb, length); - skb->ip_summed = CHECKSUM_UNNECESSARY; skb->protocol = eth_type_trans(skb, dev); + + /* The packet was not an IPV4 packet so a complemented checksum was + calculated. The value is found in the Internet Checksum field. */ + if (cqe->status & EHEA_CQE_BLIND_CKSUM) { + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum = csum_unfold(~cqe->inet_checksum_value); + } else + skb->ip_summed = CHECKSUM_UNNECESSARY; } static inline struct sk_buff *get_skb_by_index(struct sk_buff **skb_array, diff --git a/drivers/net/ehea/ehea_qmr.h b/drivers/net/ehea/ehea_qmr.h index f608a6c54af5..38104734a3be 100644 --- a/drivers/net/ehea/ehea_qmr.h +++ b/drivers/net/ehea/ehea_qmr.h @@ -150,6 +150,7 @@ struct ehea_rwqe { #define EHEA_CQE_TYPE_RQ 0x60 #define EHEA_CQE_STAT_ERR_MASK 0x700F #define EHEA_CQE_STAT_FAT_ERR_MASK 0xF +#define EHEA_CQE_BLIND_CKSUM 0x8000 #define EHEA_CQE_STAT_ERR_TCP 0x4000 #define EHEA_CQE_STAT_ERR_IP 0x2000 #define EHEA_CQE_STAT_ERR_CRC 0x1000 diff --git a/drivers/net/fec.c b/drivers/net/fec.c index 768b840aeb6b..cce32d43175f 100644 --- a/drivers/net/fec.c +++ b/drivers/net/fec.c @@ -678,24 +678,37 @@ static int fec_enet_mii_probe(struct net_device *dev) { struct fec_enet_private *fep = netdev_priv(dev); struct phy_device *phy_dev = NULL; - int ret; + char mdio_bus_id[MII_BUS_ID_SIZE]; + char phy_name[MII_BUS_ID_SIZE + 3]; + int phy_id; fep->phy_dev = NULL; - /* find the first phy */ - phy_dev = phy_find_first(fep->mii_bus); - if (!phy_dev) { - printk(KERN_ERR "%s: no PHY found\n", dev->name); - return -ENODEV; + /* check for attached phy */ + for (phy_id = 0; (phy_id < PHY_MAX_ADDR); phy_id++) { + if ((fep->mii_bus->phy_mask & (1 << phy_id))) + continue; + if (fep->mii_bus->phy_map[phy_id] == NULL) + continue; + if (fep->mii_bus->phy_map[phy_id]->phy_id == 0) + continue; + strncpy(mdio_bus_id, fep->mii_bus->id, MII_BUS_ID_SIZE); + break; } - /* attach the mac to the phy */ - ret = phy_connect_direct(dev, phy_dev, - &fec_enet_adjust_link, 0, - PHY_INTERFACE_MODE_MII); - if (ret) { - printk(KERN_ERR "%s: Could not attach to PHY\n", dev->name); - return ret; + if (phy_id >= PHY_MAX_ADDR) { + printk(KERN_INFO "%s: no PHY, assuming direct connection " + "to switch\n", dev->name); + strncpy(mdio_bus_id, "0", MII_BUS_ID_SIZE); + phy_id = 0; + } + + snprintf(phy_name, MII_BUS_ID_SIZE, PHY_ID_FMT, mdio_bus_id, phy_id); + phy_dev = phy_connect(dev, phy_name, &fec_enet_adjust_link, 0, + PHY_INTERFACE_MODE_MII); + if (IS_ERR(phy_dev)) { + printk(KERN_ERR "%s: could not attach to PHY\n", dev->name); + return PTR_ERR(phy_dev); } /* mask with MAC supported features */ @@ -738,7 +751,7 @@ static int fec_enet_mii_init(struct platform_device *pdev) fep->mii_bus->read = fec_enet_mdio_read; fep->mii_bus->write = fec_enet_mdio_write; fep->mii_bus->reset = fec_enet_mdio_reset; - snprintf(fep->mii_bus->id, MII_BUS_ID_SIZE, "%x", pdev->id); + snprintf(fep->mii_bus->id, MII_BUS_ID_SIZE, "%x", pdev->id + 1); fep->mii_bus->priv = fep; fep->mii_bus->parent = &pdev->dev; @@ -1311,6 +1324,9 @@ fec_probe(struct platform_device *pdev) if (ret) goto failed_mii_init; + /* Carrier starts down, phylib will bring it up */ + netif_carrier_off(ndev); + ret = register_netdev(ndev); if (ret) goto failed_register; diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 4b52c767ad05..3e5d0b6b6516 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -608,7 +608,7 @@ static int sixpack_open(struct tty_struct *tty) spin_lock_init(&sp->lock); atomic_set(&sp->refcnt, 1); - init_MUTEX_LOCKED(&sp->dead_sem); + sema_init(&sp->dead_sem, 0); /* !!! length of the buffers. MTU is IP MTU, not PACLEN! */ diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index 66e88bd59caa..4c628393c8b1 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -747,7 +747,7 @@ static int mkiss_open(struct tty_struct *tty) spin_lock_init(&ax->buflock); atomic_set(&ax->refcnt, 1); - init_MUTEX_LOCKED(&ax->dead_sem); + sema_init(&ax->dead_sem, 0); ax->tty = tty; tty->disc_data = ax; diff --git a/drivers/net/irda/sir_dev.c b/drivers/net/irda/sir_dev.c index 1b051dab7b29..51d74447f8f8 100644 --- a/drivers/net/irda/sir_dev.c +++ b/drivers/net/irda/sir_dev.c @@ -909,7 +909,7 @@ struct sir_dev * sirdev_get_instance(const struct sir_driver *drv, const char *n dev->tx_skb = NULL; spin_lock_init(&dev->tx_lock); - init_MUTEX(&dev->fsm.sem); + sema_init(&dev->fsm.sem, 1); dev->drv = drv; dev->netdev = ndev; diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c index af50a530daee..78d70a6481bf 100644 --- a/drivers/net/ppp_async.c +++ b/drivers/net/ppp_async.c @@ -184,7 +184,7 @@ ppp_asynctty_open(struct tty_struct *tty) tasklet_init(&ap->tsk, ppp_async_process, (unsigned long) ap); atomic_set(&ap->refcnt, 1); - init_MUTEX_LOCKED(&ap->dead_sem); + sema_init(&ap->dead_sem, 0); ap->chan.private = ap; ap->chan.ops = &async_ops; diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index a0da4a17b025..992db2fa136e 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -1212,7 +1212,8 @@ static void rtl8169_update_counters(struct net_device *dev) if ((RTL_R8(ChipCmd) & CmdRxEnb) == 0) return; - counters = pci_alloc_consistent(tp->pci_dev, sizeof(*counters), &paddr); + counters = dma_alloc_coherent(&tp->pci_dev->dev, sizeof(*counters), + &paddr, GFP_KERNEL); if (!counters) return; @@ -1233,7 +1234,8 @@ static void rtl8169_update_counters(struct net_device *dev) RTL_W32(CounterAddrLow, 0); RTL_W32(CounterAddrHigh, 0); - pci_free_consistent(tp->pci_dev, sizeof(*counters), counters, paddr); + dma_free_coherent(&tp->pci_dev->dev, sizeof(*counters), counters, + paddr); } static void rtl8169_get_ethtool_stats(struct net_device *dev, @@ -3292,15 +3294,15 @@ static int rtl8169_open(struct net_device *dev) /* * Rx and Tx desscriptors needs 256 bytes alignment. - * pci_alloc_consistent provides more. + * dma_alloc_coherent provides more. */ - tp->TxDescArray = pci_alloc_consistent(pdev, R8169_TX_RING_BYTES, - &tp->TxPhyAddr); + tp->TxDescArray = dma_alloc_coherent(&pdev->dev, R8169_TX_RING_BYTES, + &tp->TxPhyAddr, GFP_KERNEL); if (!tp->TxDescArray) goto err_pm_runtime_put; - tp->RxDescArray = pci_alloc_consistent(pdev, R8169_RX_RING_BYTES, - &tp->RxPhyAddr); + tp->RxDescArray = dma_alloc_coherent(&pdev->dev, R8169_RX_RING_BYTES, + &tp->RxPhyAddr, GFP_KERNEL); if (!tp->RxDescArray) goto err_free_tx_0; @@ -3334,12 +3336,12 @@ out: err_release_ring_2: rtl8169_rx_clear(tp); err_free_rx_1: - pci_free_consistent(pdev, R8169_RX_RING_BYTES, tp->RxDescArray, - tp->RxPhyAddr); + dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray, + tp->RxPhyAddr); tp->RxDescArray = NULL; err_free_tx_0: - pci_free_consistent(pdev, R8169_TX_RING_BYTES, tp->TxDescArray, - tp->TxPhyAddr); + dma_free_coherent(&pdev->dev, R8169_TX_RING_BYTES, tp->TxDescArray, + tp->TxPhyAddr); tp->TxDescArray = NULL; err_pm_runtime_put: pm_runtime_put_noidle(&pdev->dev); @@ -3975,7 +3977,7 @@ static void rtl8169_free_rx_skb(struct rtl8169_private *tp, { struct pci_dev *pdev = tp->pci_dev; - pci_unmap_single(pdev, le64_to_cpu(desc->addr), tp->rx_buf_sz, + dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), tp->rx_buf_sz, PCI_DMA_FROMDEVICE); dev_kfree_skb(*sk_buff); *sk_buff = NULL; @@ -4000,7 +4002,7 @@ static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping, static struct sk_buff *rtl8169_alloc_rx_skb(struct pci_dev *pdev, struct net_device *dev, struct RxDesc *desc, int rx_buf_sz, - unsigned int align) + unsigned int align, gfp_t gfp) { struct sk_buff *skb; dma_addr_t mapping; @@ -4008,13 +4010,13 @@ static struct sk_buff *rtl8169_alloc_rx_skb(struct pci_dev *pdev, pad = align ? align : NET_IP_ALIGN; - skb = netdev_alloc_skb(dev, rx_buf_sz + pad); + skb = __netdev_alloc_skb(dev, rx_buf_sz + pad, gfp); if (!skb) goto err_out; skb_reserve(skb, align ? ((pad - 1) & (unsigned long)skb->data) : pad); - mapping = pci_map_single(pdev, skb->data, rx_buf_sz, + mapping = dma_map_single(&pdev->dev, skb->data, rx_buf_sz, PCI_DMA_FROMDEVICE); rtl8169_map_to_asic(desc, mapping, rx_buf_sz); @@ -4039,7 +4041,7 @@ static void rtl8169_rx_clear(struct rtl8169_private *tp) } static u32 rtl8169_rx_fill(struct rtl8169_private *tp, struct net_device *dev, - u32 start, u32 end) + u32 start, u32 end, gfp_t gfp) { u32 cur; @@ -4054,7 +4056,7 @@ static u32 rtl8169_rx_fill(struct rtl8169_private *tp, struct net_device *dev, skb = rtl8169_alloc_rx_skb(tp->pci_dev, dev, tp->RxDescArray + i, - tp->rx_buf_sz, tp->align); + tp->rx_buf_sz, tp->align, gfp); if (!skb) break; @@ -4082,7 +4084,7 @@ static int rtl8169_init_ring(struct net_device *dev) memset(tp->tx_skb, 0x0, NUM_TX_DESC * sizeof(struct ring_info)); memset(tp->Rx_skbuff, 0x0, NUM_RX_DESC * sizeof(struct sk_buff *)); - if (rtl8169_rx_fill(tp, dev, 0, NUM_RX_DESC) != NUM_RX_DESC) + if (rtl8169_rx_fill(tp, dev, 0, NUM_RX_DESC, GFP_KERNEL) != NUM_RX_DESC) goto err_out; rtl8169_mark_as_last_descriptor(tp->RxDescArray + NUM_RX_DESC - 1); @@ -4099,7 +4101,8 @@ static void rtl8169_unmap_tx_skb(struct pci_dev *pdev, struct ring_info *tx_skb, { unsigned int len = tx_skb->len; - pci_unmap_single(pdev, le64_to_cpu(desc->addr), len, PCI_DMA_TODEVICE); + dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), len, + PCI_DMA_TODEVICE); desc->opts1 = 0x00; desc->opts2 = 0x00; desc->addr = 0x00; @@ -4243,7 +4246,8 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb, txd = tp->TxDescArray + entry; len = frag->size; addr = ((void *) page_address(frag->page)) + frag->page_offset; - mapping = pci_map_single(tp->pci_dev, addr, len, PCI_DMA_TODEVICE); + mapping = dma_map_single(&tp->pci_dev->dev, addr, len, + PCI_DMA_TODEVICE); /* anti gcc 2.95.3 bugware (sic) */ status = opts1 | len | (RingEnd * !((entry + 1) % NUM_TX_DESC)); @@ -4313,7 +4317,8 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb, tp->tx_skb[entry].skb = skb; } - mapping = pci_map_single(tp->pci_dev, skb->data, len, PCI_DMA_TODEVICE); + mapping = dma_map_single(&tp->pci_dev->dev, skb->data, len, + PCI_DMA_TODEVICE); tp->tx_skb[entry].len = len; txd->addr = cpu_to_le64(mapping); @@ -4477,8 +4482,8 @@ static inline bool rtl8169_try_rx_copy(struct sk_buff **sk_buff, if (!skb) goto out; - pci_dma_sync_single_for_cpu(tp->pci_dev, addr, pkt_size, - PCI_DMA_FROMDEVICE); + dma_sync_single_for_cpu(&tp->pci_dev->dev, addr, pkt_size, + PCI_DMA_FROMDEVICE); skb_copy_from_linear_data(*sk_buff, skb->data, pkt_size); *sk_buff = skb; done = true; @@ -4549,11 +4554,11 @@ static int rtl8169_rx_interrupt(struct net_device *dev, rtl8169_rx_csum(skb, desc); if (rtl8169_try_rx_copy(&skb, tp, pkt_size, addr)) { - pci_dma_sync_single_for_device(pdev, addr, + dma_sync_single_for_device(&pdev->dev, addr, pkt_size, PCI_DMA_FROMDEVICE); rtl8169_mark_to_asic(desc, tp->rx_buf_sz); } else { - pci_unmap_single(pdev, addr, tp->rx_buf_sz, + dma_unmap_single(&pdev->dev, addr, tp->rx_buf_sz, PCI_DMA_FROMDEVICE); tp->Rx_skbuff[entry] = NULL; } @@ -4583,7 +4588,7 @@ static int rtl8169_rx_interrupt(struct net_device *dev, count = cur_rx - tp->cur_rx; tp->cur_rx = cur_rx; - delta = rtl8169_rx_fill(tp, dev, tp->dirty_rx, tp->cur_rx); + delta = rtl8169_rx_fill(tp, dev, tp->dirty_rx, tp->cur_rx, GFP_ATOMIC); if (!delta && count) netif_info(tp, intr, dev, "no Rx buffer allocated\n"); tp->dirty_rx += delta; @@ -4769,10 +4774,10 @@ static int rtl8169_close(struct net_device *dev) free_irq(dev->irq, dev); - pci_free_consistent(pdev, R8169_RX_RING_BYTES, tp->RxDescArray, - tp->RxPhyAddr); - pci_free_consistent(pdev, R8169_TX_RING_BYTES, tp->TxDescArray, - tp->TxPhyAddr); + dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray, + tp->RxPhyAddr); + dma_free_coherent(&pdev->dev, R8169_TX_RING_BYTES, tp->TxDescArray, + tp->TxPhyAddr); tp->TxDescArray = NULL; tp->RxDescArray = NULL; diff --git a/drivers/net/skge.c b/drivers/net/skge.c index 40e5c46e7571..465ae7e84507 100644 --- a/drivers/net/skge.c +++ b/drivers/net/skge.c @@ -43,6 +43,7 @@ #include <linux/seq_file.h> #include <linux/mii.h> #include <linux/slab.h> +#include <linux/dmi.h> #include <asm/irq.h> #include "skge.h" @@ -3868,6 +3869,8 @@ static void __devinit skge_show_addr(struct net_device *dev) netif_info(skge, probe, skge->netdev, "addr %pM\n", dev->dev_addr); } +static int only_32bit_dma; + static int __devinit skge_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -3889,7 +3892,7 @@ static int __devinit skge_probe(struct pci_dev *pdev, pci_set_master(pdev); - if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { + if (!only_32bit_dma && !pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { using_dac = 1; err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); } else if (!(err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)))) { @@ -4147,8 +4150,21 @@ static struct pci_driver skge_driver = { .shutdown = skge_shutdown, }; +static struct dmi_system_id skge_32bit_dma_boards[] = { + { + .ident = "Gigabyte nForce boards", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Gigabyte Technology Co"), + DMI_MATCH(DMI_BOARD_NAME, "nForce"), + }, + }, + {} +}; + static int __init skge_init_module(void) { + if (dmi_check_system(skge_32bit_dma_boards)) + only_32bit_dma = 1; skge_debug_init(); return pci_register_driver(&skge_driver); } diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index bc3af78a869f..1ec4b9e0239a 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -4666,7 +4666,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget) desc_idx, *post_ptr); drop_it_no_recycle: /* Other statistics kept track of by card. */ - tp->net_stats.rx_dropped++; + tp->rx_dropped++; goto next_pkt; } @@ -4726,7 +4726,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget) if (len > (tp->dev->mtu + ETH_HLEN) && skb->protocol != htons(ETH_P_8021Q)) { dev_kfree_skb(skb); - goto next_pkt; + goto drop_it_no_recycle; } if (desc->type_flags & RXD_FLAG_VLAN && @@ -9240,6 +9240,8 @@ static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev, stats->rx_missed_errors = old_stats->rx_missed_errors + get_stat64(&hw_stats->rx_discards); + stats->rx_dropped = tp->rx_dropped; + return stats; } diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h index 4937bd190964..be7ff138a7f9 100644 --- a/drivers/net/tg3.h +++ b/drivers/net/tg3.h @@ -2759,7 +2759,7 @@ struct tg3 { /* begin "everything else" cacheline(s) section */ - struct rtnl_link_stats64 net_stats; + unsigned long rx_dropped; struct rtnl_link_stats64 net_stats_prev; struct tg3_ethtool_stats estats; struct tg3_ethtool_stats estats_prev; diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c index 04c6cd4333f1..10bafd59f9c3 100644 --- a/drivers/net/wan/cosa.c +++ b/drivers/net/wan/cosa.c @@ -575,7 +575,7 @@ static int cosa_probe(int base, int irq, int dma) /* Initialize the chardev data structures */ mutex_init(&chan->rlock); - init_MUTEX(&chan->wsem); + sema_init(&chan->wsem, 1); /* Register the network interface */ if (!(chan->netdev = alloc_hdlcdev(chan))) { diff --git a/drivers/net/wimax/i2400m/rx.c b/drivers/net/wimax/i2400m/rx.c index 8cc9e319f435..1737d1488b35 100644 --- a/drivers/net/wimax/i2400m/rx.c +++ b/drivers/net/wimax/i2400m/rx.c @@ -1244,16 +1244,16 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb) int i, result; struct device *dev = i2400m_dev(i2400m); const struct i2400m_msg_hdr *msg_hdr; - size_t pl_itr, pl_size, skb_len; + size_t pl_itr, pl_size; unsigned long flags; - unsigned num_pls, single_last; + unsigned num_pls, single_last, skb_len; skb_len = skb->len; - d_fnstart(4, dev, "(i2400m %p skb %p [size %zu])\n", + d_fnstart(4, dev, "(i2400m %p skb %p [size %u])\n", i2400m, skb, skb_len); result = -EIO; msg_hdr = (void *) skb->data; - result = i2400m_rx_msg_hdr_check(i2400m, msg_hdr, skb->len); + result = i2400m_rx_msg_hdr_check(i2400m, msg_hdr, skb_len); if (result < 0) goto error_msg_hdr_check; result = -EIO; @@ -1261,10 +1261,10 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb) pl_itr = sizeof(*msg_hdr) + /* Check payload descriptor(s) */ num_pls * sizeof(msg_hdr->pld[0]); pl_itr = ALIGN(pl_itr, I2400M_PL_ALIGN); - if (pl_itr > skb->len) { /* got all the payload descriptors? */ + if (pl_itr > skb_len) { /* got all the payload descriptors? */ dev_err(dev, "RX: HW BUG? message too short (%u bytes) for " "%u payload descriptors (%zu each, total %zu)\n", - skb->len, num_pls, sizeof(msg_hdr->pld[0]), pl_itr); + skb_len, num_pls, sizeof(msg_hdr->pld[0]), pl_itr); goto error_pl_descr_short; } /* Walk each payload payload--check we really got it */ @@ -1272,7 +1272,7 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb) /* work around old gcc warnings */ pl_size = i2400m_pld_size(&msg_hdr->pld[i]); result = i2400m_rx_pl_descr_check(i2400m, &msg_hdr->pld[i], - pl_itr, skb->len); + pl_itr, skb_len); if (result < 0) goto error_pl_descr_check; single_last = num_pls == 1 || i == num_pls - 1; @@ -1290,16 +1290,16 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb) if (i < i2400m->rx_pl_min) i2400m->rx_pl_min = i; i2400m->rx_num++; - i2400m->rx_size_acc += skb->len; - if (skb->len < i2400m->rx_size_min) - i2400m->rx_size_min = skb->len; - if (skb->len > i2400m->rx_size_max) - i2400m->rx_size_max = skb->len; + i2400m->rx_size_acc += skb_len; + if (skb_len < i2400m->rx_size_min) + i2400m->rx_size_min = skb_len; + if (skb_len > i2400m->rx_size_max) + i2400m->rx_size_max = skb_len; spin_unlock_irqrestore(&i2400m->rx_lock, flags); error_pl_descr_check: error_pl_descr_short: error_msg_hdr_check: - d_fnend(4, dev, "(i2400m %p skb %p [size %zu]) = %d\n", + d_fnend(4, dev, "(i2400m %p skb %p [size %u]) = %d\n", i2400m, skb, skb_len, result); return result; } diff --git a/drivers/net/wireless/ath/ath9k/ani.c b/drivers/net/wireless/ath/ath9k/ani.c index cc648b6ae31c..a3d95cca8f0c 100644 --- a/drivers/net/wireless/ath/ath9k/ani.c +++ b/drivers/net/wireless/ath/ath9k/ani.c @@ -543,7 +543,7 @@ static u8 ath9k_hw_chan_2_clockrate_mhz(struct ath_hw *ah) if (conf_is_ht40(conf)) return clockrate * 2; - return clockrate * 2; + return clockrate; } static int32_t ath9k_hw_ani_get_listen_time(struct ath_hw *ah) diff --git a/drivers/parport/share.c b/drivers/parport/share.c index dffa5d4fb298..a2d9d1e59260 100644 --- a/drivers/parport/share.c +++ b/drivers/parport/share.c @@ -306,7 +306,7 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma, spin_lock_init(&tmp->pardevice_lock); tmp->ieee1284.mode = IEEE1284_MODE_COMPAT; tmp->ieee1284.phase = IEEE1284_PH_FWD_IDLE; - init_MUTEX_LOCKED (&tmp->ieee1284.irq); /* actually a semaphore at 0 */ + sema_init(&tmp->ieee1284.irq, 0); tmp->spintime = parport_default_spintime; atomic_set (&tmp->ref_count, 1); INIT_LIST_HEAD(&tmp->full_list); diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c index 9024480a8228..c44a5e8b8b82 100644 --- a/drivers/platform/x86/intel_ips.c +++ b/drivers/platform/x86/intel_ips.c @@ -51,7 +51,6 @@ * TODO: * - handle CPU hotplug * - provide turbo enable/disable api - * - make sure we can write turbo enable/disable reg based on MISC_EN * * Related documents: * - CDI 403777, 403778 - Auburndale EDS vol 1 & 2 @@ -230,7 +229,7 @@ #define THM_TC2 0xac #define THM_DTV 0xb0 #define THM_ITV 0xd8 -#define ITV_ME_SEQNO_MASK 0x000f0000 /* ME should update every ~200ms */ +#define ITV_ME_SEQNO_MASK 0x00ff0000 /* ME should update every ~200ms */ #define ITV_ME_SEQNO_SHIFT (16) #define ITV_MCH_TEMP_MASK 0x0000ff00 #define ITV_MCH_TEMP_SHIFT (8) @@ -325,6 +324,7 @@ struct ips_driver { bool gpu_preferred; bool poll_turbo_status; bool second_cpu; + bool turbo_toggle_allowed; struct ips_mcp_limits *limits; /* Optional MCH interfaces for if i915 is in use */ @@ -415,7 +415,7 @@ static void ips_cpu_lower(struct ips_driver *ips) new_limit = cur_limit - 8; /* 1W decrease */ /* Clamp to SKU TDP limit */ - if (((new_limit * 10) / 8) < (ips->orig_turbo_limit & TURBO_TDP_MASK)) + if (new_limit < (ips->orig_turbo_limit & TURBO_TDP_MASK)) new_limit = ips->orig_turbo_limit & TURBO_TDP_MASK; thm_writew(THM_MPCPC, (new_limit * 10) / 8); @@ -461,7 +461,8 @@ static void ips_enable_cpu_turbo(struct ips_driver *ips) if (ips->__cpu_turbo_on) return; - on_each_cpu(do_enable_cpu_turbo, ips, 1); + if (ips->turbo_toggle_allowed) + on_each_cpu(do_enable_cpu_turbo, ips, 1); ips->__cpu_turbo_on = true; } @@ -498,7 +499,8 @@ static void ips_disable_cpu_turbo(struct ips_driver *ips) if (!ips->__cpu_turbo_on) return; - on_each_cpu(do_disable_cpu_turbo, ips, 1); + if (ips->turbo_toggle_allowed) + on_each_cpu(do_disable_cpu_turbo, ips, 1); ips->__cpu_turbo_on = false; } @@ -598,17 +600,29 @@ static bool mcp_exceeded(struct ips_driver *ips) { unsigned long flags; bool ret = false; + u32 temp_limit; + u32 avg_power; + const char *msg = "MCP limit exceeded: "; spin_lock_irqsave(&ips->turbo_status_lock, flags); - if (ips->mcp_avg_temp > (ips->mcp_temp_limit * 100)) - ret = true; - if (ips->cpu_avg_power + ips->mch_avg_power > ips->mcp_power_limit) + + temp_limit = ips->mcp_temp_limit * 100; + if (ips->mcp_avg_temp > temp_limit) { + dev_info(&ips->dev->dev, + "%sAvg temp %u, limit %u\n", msg, ips->mcp_avg_temp, + temp_limit); ret = true; - spin_unlock_irqrestore(&ips->turbo_status_lock, flags); + } - if (ret) + avg_power = ips->cpu_avg_power + ips->mch_avg_power; + if (avg_power > ips->mcp_power_limit) { dev_info(&ips->dev->dev, - "MCP power or thermal limit exceeded\n"); + "%sAvg power %u, limit %u\n", msg, avg_power, + ips->mcp_power_limit); + ret = true; + } + + spin_unlock_irqrestore(&ips->turbo_status_lock, flags); return ret; } @@ -663,6 +677,27 @@ static bool mch_exceeded(struct ips_driver *ips) } /** + * verify_limits - verify BIOS provided limits + * @ips: IPS structure + * + * BIOS can optionally provide non-default limits for power and temp. Check + * them here and use the defaults if the BIOS values are not provided or + * are otherwise unusable. + */ +static void verify_limits(struct ips_driver *ips) +{ + if (ips->mcp_power_limit < ips->limits->mcp_power_limit || + ips->mcp_power_limit > 35000) + ips->mcp_power_limit = ips->limits->mcp_power_limit; + + if (ips->mcp_temp_limit < ips->limits->core_temp_limit || + ips->mcp_temp_limit < ips->limits->mch_temp_limit || + ips->mcp_temp_limit > 150) + ips->mcp_temp_limit = min(ips->limits->core_temp_limit, + ips->limits->mch_temp_limit); +} + +/** * update_turbo_limits - get various limits & settings from regs * @ips: IPS driver struct * @@ -680,12 +715,21 @@ static void update_turbo_limits(struct ips_driver *ips) u32 hts = thm_readl(THM_HTS); ips->cpu_turbo_enabled = !(hts & HTS_PCTD_DIS); - ips->gpu_turbo_enabled = !(hts & HTS_GTD_DIS); + /* + * Disable turbo for now, until we can figure out why the power figures + * are wrong + */ + ips->cpu_turbo_enabled = false; + + if (ips->gpu_busy) + ips->gpu_turbo_enabled = !(hts & HTS_GTD_DIS); + ips->core_power_limit = thm_readw(THM_MPCPC); ips->mch_power_limit = thm_readw(THM_MMGPC); ips->mcp_temp_limit = thm_readw(THM_PTL); ips->mcp_power_limit = thm_readw(THM_MPPC); + verify_limits(ips); /* Ignore BIOS CPU vs GPU pref */ } @@ -858,7 +902,7 @@ static u32 get_cpu_power(struct ips_driver *ips, u32 *last, int period) ret = (ret * 1000) / 65535; *last = val; - return ret; + return 0; } static const u16 temp_decay_factor = 2; @@ -940,7 +984,6 @@ static int ips_monitor(void *data) kfree(mch_samples); kfree(cpu_samples); kfree(mchp_samples); - kthread_stop(ips->adjust); return -ENOMEM; } @@ -948,7 +991,7 @@ static int ips_monitor(void *data) ITV_ME_SEQNO_SHIFT; seqno_timestamp = get_jiffies_64(); - old_cpu_power = thm_readl(THM_CEC) / 65535; + old_cpu_power = thm_readl(THM_CEC); schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); /* Collect an initial average */ @@ -1150,11 +1193,18 @@ static irqreturn_t ips_irq_handler(int irq, void *arg) STS_GPL_SHIFT; /* ignore EC CPU vs GPU pref */ ips->cpu_turbo_enabled = !(sts & STS_PCTD_DIS); - ips->gpu_turbo_enabled = !(sts & STS_GTD_DIS); + /* + * Disable turbo for now, until we can figure + * out why the power figures are wrong + */ + ips->cpu_turbo_enabled = false; + if (ips->gpu_busy) + ips->gpu_turbo_enabled = !(sts & STS_GTD_DIS); ips->mcp_temp_limit = (sts & STS_PTL_MASK) >> STS_PTL_SHIFT; ips->mcp_power_limit = (tc1 & STS_PPL_MASK) >> STS_PPL_SHIFT; + verify_limits(ips); spin_unlock(&ips->turbo_status_lock); thm_writeb(THM_SEC, SEC_ACK); @@ -1333,8 +1383,10 @@ static struct ips_mcp_limits *ips_detect_cpu(struct ips_driver *ips) * turbo manually or we'll get an illegal MSR access, even though * turbo will still be available. */ - if (!(misc_en & IA32_MISC_TURBO_EN)) - ; /* add turbo MSR write allowed flag if necessary */ + if (misc_en & IA32_MISC_TURBO_EN) + ips->turbo_toggle_allowed = true; + else + ips->turbo_toggle_allowed = false; if (strstr(boot_cpu_data.x86_model_id, "CPU M")) limits = &ips_sv_limits; @@ -1351,9 +1403,10 @@ static struct ips_mcp_limits *ips_detect_cpu(struct ips_driver *ips) tdp = turbo_power & TURBO_TDP_MASK; /* Sanity check TDP against CPU */ - if (limits->mcp_power_limit != (tdp / 8) * 1000) { - dev_warn(&ips->dev->dev, "Warning: CPU TDP doesn't match expected value (found %d, expected %d)\n", - tdp / 8, limits->mcp_power_limit / 1000); + if (limits->core_power_limit != (tdp / 8) * 1000) { + dev_info(&ips->dev->dev, "CPU TDP doesn't match expected value (found %d, expected %d)\n", + tdp / 8, limits->core_power_limit / 1000); + limits->core_power_limit = (tdp / 8) * 1000; } out: @@ -1390,7 +1443,7 @@ static bool ips_get_i915_syms(struct ips_driver *ips) return true; out_put_busy: - symbol_put(i915_gpu_turbo_disable); + symbol_put(i915_gpu_busy); out_put_lower: symbol_put(i915_gpu_lower); out_put_raise: @@ -1532,22 +1585,27 @@ static int ips_probe(struct pci_dev *dev, const struct pci_device_id *id) /* Save turbo limits & ratios */ rdmsrl(TURBO_POWER_CURRENT_LIMIT, ips->orig_turbo_limit); - ips_enable_cpu_turbo(ips); - ips->cpu_turbo_enabled = true; + ips_disable_cpu_turbo(ips); + ips->cpu_turbo_enabled = false; - /* Set up the work queue and monitor/adjust threads */ - ips->monitor = kthread_run(ips_monitor, ips, "ips-monitor"); - if (IS_ERR(ips->monitor)) { + /* Create thermal adjust thread */ + ips->adjust = kthread_create(ips_adjust, ips, "ips-adjust"); + if (IS_ERR(ips->adjust)) { dev_err(&dev->dev, - "failed to create thermal monitor thread, aborting\n"); + "failed to create thermal adjust thread, aborting\n"); ret = -ENOMEM; goto error_free_irq; + } - ips->adjust = kthread_create(ips_adjust, ips, "ips-adjust"); - if (IS_ERR(ips->adjust)) { + /* + * Set up the work queue and monitor thread. The monitor thread + * will wake up ips_adjust thread. + */ + ips->monitor = kthread_run(ips_monitor, ips, "ips-monitor"); + if (IS_ERR(ips->monitor)) { dev_err(&dev->dev, - "failed to create thermal adjust thread, aborting\n"); + "failed to create thermal monitor thread, aborting\n"); ret = -ENOMEM; goto error_thread_cleanup; } @@ -1566,7 +1624,7 @@ static int ips_probe(struct pci_dev *dev, const struct pci_device_id *id) return ret; error_thread_cleanup: - kthread_stop(ips->monitor); + kthread_stop(ips->adjust); error_free_irq: free_irq(ips->dev->irq, ips); error_unmap: diff --git a/drivers/regulator/ad5398.c b/drivers/regulator/ad5398.c index df1fb53c09d2..a4be41614eeb 100644 --- a/drivers/regulator/ad5398.c +++ b/drivers/regulator/ad5398.c @@ -256,7 +256,6 @@ static int __devexit ad5398_remove(struct i2c_client *client) regulator_unregister(chip->rdev); kfree(chip); - i2c_set_clientdata(client, NULL); return 0; } diff --git a/drivers/regulator/isl6271a-regulator.c b/drivers/regulator/isl6271a-regulator.c index d61ecb885a8c..b8cc6389a541 100644 --- a/drivers/regulator/isl6271a-regulator.c +++ b/drivers/regulator/isl6271a-regulator.c @@ -191,8 +191,6 @@ static int __devexit isl6271a_remove(struct i2c_client *i2c) struct isl_pmic *pmic = i2c_get_clientdata(i2c); int i; - i2c_set_clientdata(i2c, NULL); - for (i = 0; i < 3; i++) regulator_unregister(pmic->rdev[i]); diff --git a/drivers/rtc/rtc-ds3232.c b/drivers/rtc/rtc-ds3232.c index 9daed8db83d3..9de8516e3531 100644 --- a/drivers/rtc/rtc-ds3232.c +++ b/drivers/rtc/rtc-ds3232.c @@ -268,7 +268,6 @@ out_irq: free_irq(client->irq, client); out_free: - i2c_set_clientdata(client, NULL); kfree(ds3232); return ret; } @@ -287,7 +286,6 @@ static int __devexit ds3232_remove(struct i2c_client *client) } rtc_device_unregister(ds3232->rtc); - i2c_set_clientdata(client, NULL); kfree(ds3232); return 0; } diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index ad0ed212db4a..348fba0a8976 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -1046,13 +1046,13 @@ int scsi_get_vpd_page(struct scsi_device *sdev, u8 page, unsigned char *buf, /* If the user actually wanted this page, we can skip the rest */ if (page == 0) - return -EINVAL; + return 0; for (i = 0; i < min((int)buf[3], buf_len - 4); i++) if (buf[i + 4] == page) goto found; - if (i < buf[3] && i > buf_len) + if (i < buf[3] && i >= buf_len - 4) /* ran off the end of the buffer, give us benefit of doubt */ goto found; /* The device claims it doesn't support the requested page */ diff --git a/drivers/serial/ioc3_serial.c b/drivers/serial/ioc3_serial.c index 93de907b1208..800c54602339 100644 --- a/drivers/serial/ioc3_serial.c +++ b/drivers/serial/ioc3_serial.c @@ -2044,6 +2044,7 @@ ioc3uart_probe(struct ioc3_submodule *is, struct ioc3_driver_data *idd) if (!port) { printk(KERN_WARNING "IOC3 serial memory not available for port\n"); + ret = -ENOMEM; goto out4; } spin_lock_init(&port->ip_lock); diff --git a/drivers/staging/tm6000/Kconfig b/drivers/staging/tm6000/Kconfig index c725356cc346..de7ebb99d8f6 100644 --- a/drivers/staging/tm6000/Kconfig +++ b/drivers/staging/tm6000/Kconfig @@ -1,6 +1,6 @@ config VIDEO_TM6000 tristate "TV Master TM5600/6000/6010 driver" - depends on VIDEO_DEV && I2C && INPUT && USB && EXPERIMENTAL + depends on VIDEO_DEV && I2C && INPUT && IR_CORE && USB && EXPERIMENTAL select VIDEO_TUNER select MEDIA_TUNER_XC2028 select MEDIA_TUNER_XC5000 diff --git a/drivers/staging/tm6000/tm6000-input.c b/drivers/staging/tm6000/tm6000-input.c index 32f7a0af6938..54f7667cc706 100644 --- a/drivers/staging/tm6000/tm6000-input.c +++ b/drivers/staging/tm6000/tm6000-input.c @@ -46,7 +46,7 @@ MODULE_PARM_DESC(enable_ir, "enable ir (default is enable"); } struct tm6000_ir_poll_result { - u8 rc_data[4]; + u16 rc_data; }; struct tm6000_IR { @@ -60,9 +60,9 @@ struct tm6000_IR { int polling; struct delayed_work work; u8 wait:1; + u8 key:1; struct urb *int_urb; u8 *urb_data; - u8 key:1; int (*get_key) (struct tm6000_IR *, struct tm6000_ir_poll_result *); @@ -122,13 +122,14 @@ static void tm6000_ir_urb_received(struct urb *urb) if (urb->status != 0) printk(KERN_INFO "not ready\n"); - else if (urb->actual_length > 0) + else if (urb->actual_length > 0) { memcpy(ir->urb_data, urb->transfer_buffer, urb->actual_length); - dprintk("data %02x %02x %02x %02x\n", ir->urb_data[0], - ir->urb_data[1], ir->urb_data[2], ir->urb_data[3]); + dprintk("data %02x %02x %02x %02x\n", ir->urb_data[0], + ir->urb_data[1], ir->urb_data[2], ir->urb_data[3]); - ir->key = 1; + ir->key = 1; + } rc = usb_submit_urb(urb, GFP_ATOMIC); } @@ -140,30 +141,47 @@ static int default_polling_getkey(struct tm6000_IR *ir, int rc; u8 buf[2]; - if (ir->wait && !&dev->int_in) { - poll_result->rc_data[0] = 0xff; + if (ir->wait && !&dev->int_in) return 0; - } if (&dev->int_in) { - poll_result->rc_data[0] = ir->urb_data[0]; - poll_result->rc_data[1] = ir->urb_data[1]; + if (ir->ir.ir_type == IR_TYPE_RC5) + poll_result->rc_data = ir->urb_data[0]; + else + poll_result->rc_data = ir->urb_data[0] | ir->urb_data[1] << 8; } else { tm6000_set_reg(dev, REQ_04_EN_DISABLE_MCU_INT, 2, 0); msleep(10); tm6000_set_reg(dev, REQ_04_EN_DISABLE_MCU_INT, 2, 1); msleep(10); - rc = tm6000_read_write_usb(dev, USB_DIR_IN | USB_TYPE_VENDOR | - USB_RECIP_DEVICE, REQ_02_GET_IR_CODE, 0, 0, buf, 1); + if (ir->ir.ir_type == IR_TYPE_RC5) { + rc = tm6000_read_write_usb(dev, USB_DIR_IN | + USB_TYPE_VENDOR | USB_RECIP_DEVICE, + REQ_02_GET_IR_CODE, 0, 0, buf, 1); - msleep(10); + msleep(10); - dprintk("read data=%02x\n", buf[0]); - if (rc < 0) - return rc; + dprintk("read data=%02x\n", buf[0]); + if (rc < 0) + return rc; - poll_result->rc_data[0] = buf[0]; + poll_result->rc_data = buf[0]; + } else { + rc = tm6000_read_write_usb(dev, USB_DIR_IN | + USB_TYPE_VENDOR | USB_RECIP_DEVICE, + REQ_02_GET_IR_CODE, 0, 0, buf, 2); + + msleep(10); + + dprintk("read data=%04x\n", buf[0] | buf[1] << 8); + if (rc < 0) + return rc; + + poll_result->rc_data = buf[0] | buf[1] << 8; + } + if ((poll_result->rc_data & 0x00ff) != 0xff) + ir->key = 1; } return 0; } @@ -180,12 +198,11 @@ static void tm6000_ir_handle_key(struct tm6000_IR *ir) return; } - dprintk("ir->get_key result data=%02x %02x\n", - poll_result.rc_data[0], poll_result.rc_data[1]); + dprintk("ir->get_key result data=%04x\n", poll_result.rc_data); - if (poll_result.rc_data[0] != 0xff && ir->key == 1) { + if (ir->key) { ir_input_keydown(ir->input->input_dev, &ir->ir, - poll_result.rc_data[0] | poll_result.rc_data[1] << 8); + (u32)poll_result.rc_data); ir_input_nokey(ir->input->input_dev, &ir->ir); ir->key = 0; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 7c8008225ee3..17927b1f9334 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -127,7 +127,10 @@ static void handle_tx(struct vhost_net *net) size_t len, total_len = 0; int err, wmem; size_t hdr_size; - struct socket *sock = rcu_dereference(vq->private_data); + struct socket *sock; + + sock = rcu_dereference_check(vq->private_data, + lockdep_is_held(&vq->mutex)); if (!sock) return; @@ -582,7 +585,10 @@ static void vhost_net_disable_vq(struct vhost_net *n, static void vhost_net_enable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { - struct socket *sock = vq->private_data; + struct socket *sock; + + sock = rcu_dereference_protected(vq->private_data, + lockdep_is_held(&vq->mutex)); if (!sock) return; if (vq == n->vqs + VHOST_NET_VQ_TX) { @@ -598,7 +604,8 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n, struct socket *sock; mutex_lock(&vq->mutex); - sock = vq->private_data; + sock = rcu_dereference_protected(vq->private_data, + lockdep_is_held(&vq->mutex)); vhost_net_disable_vq(n, vq); rcu_assign_pointer(vq->private_data, NULL); mutex_unlock(&vq->mutex); @@ -736,7 +743,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) } /* start polling new socket */ - oldsock = vq->private_data; + oldsock = rcu_dereference_protected(vq->private_data, + lockdep_is_held(&vq->mutex)); if (sock != oldsock) { vhost_net_disable_vq(n, vq); rcu_assign_pointer(vq->private_data, sock); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index dd3d6f7406f8..8b5a1b33d0fe 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -320,7 +320,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev) vhost_dev_cleanup(dev); memory->nregions = 0; - dev->memory = memory; + RCU_INIT_POINTER(dev->memory, memory); return 0; } @@ -352,8 +352,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev) fput(dev->log_file); dev->log_file = NULL; /* No one will access memory at this point */ - kfree(dev->memory); - dev->memory = NULL; + kfree(rcu_dereference_protected(dev->memory, + lockdep_is_held(&dev->mutex))); + RCU_INIT_POINTER(dev->memory, NULL); if (dev->mm) mmput(dev->mm); dev->mm = NULL; @@ -440,14 +441,22 @@ static int vq_access_ok(unsigned int num, /* Caller should have device mutex but not vq mutex */ int vhost_log_access_ok(struct vhost_dev *dev) { - return memory_access_ok(dev, dev->memory, 1); + struct vhost_memory *mp; + + mp = rcu_dereference_protected(dev->memory, + lockdep_is_held(&dev->mutex)); + return memory_access_ok(dev, mp, 1); } /* Verify access for write logging. */ /* Caller should have vq mutex and device mutex */ static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base) { - return vq_memory_access_ok(log_base, vq->dev->memory, + struct vhost_memory *mp; + + mp = rcu_dereference_protected(vq->dev->memory, + lockdep_is_held(&vq->mutex)); + return vq_memory_access_ok(log_base, mp, vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) && (!vq->log_used || log_access_ok(log_base, vq->log_addr, sizeof *vq->used + @@ -487,7 +496,8 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) kfree(newmem); return -EFAULT; } - oldmem = d->memory; + oldmem = rcu_dereference_protected(d->memory, + lockdep_is_held(&d->mutex)); rcu_assign_pointer(d->memory, newmem); synchronize_rcu(); kfree(oldmem); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index afd77295971c..af3c11ded5fd 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -106,7 +106,7 @@ struct vhost_virtqueue { * vhost_work execution acts instead of rcu_read_lock() and the end of * vhost_work execution acts instead of rcu_read_lock(). * Writers use virtqueue mutex. */ - void *private_data; + void __rcu *private_data; /* Log write descriptors */ void __user *log_base; struct vhost_log log[VHOST_NET_MAX_SG]; @@ -116,7 +116,7 @@ struct vhost_dev { /* Readers use RCU to access memory table pointer * log base pointer and features. * Writers use mutex below.*/ - struct vhost_memory *memory; + struct vhost_memory __rcu *memory; struct mm_struct *mm; struct mutex mutex; unsigned acked_features; @@ -173,7 +173,11 @@ enum { static inline int vhost_has_feature(struct vhost_dev *dev, int bit) { - unsigned acked_features = rcu_dereference(dev->acked_features); + unsigned acked_features; + + acked_features = + rcu_dereference_index_check(dev->acked_features, + lockdep_is_held(&dev->mutex)); return acked_features & (1 << bit); } diff --git a/fs/affs/super.c b/fs/affs/super.c index 33c4e7eef470..9581ea94d5a1 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -109,8 +109,8 @@ static void init_once(void *foo) { struct affs_inode_info *ei = (struct affs_inode_info *) foo; - init_MUTEX(&ei->i_link_lock); - init_MUTEX(&ei->i_ext_lock); + sema_init(&ei->i_link_lock, 1); + sema_init(&ei->i_ext_lock, 1); inode_init_once(&ei->vfs_inode); } diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index f96eff04e11a..a6395bdb26ae 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -134,10 +134,6 @@ static int aout_core_dump(struct coredump_params *cprm) if (!dump_write(file, dump_start, dump_size)) goto end_coredump; } -/* Finally dump the task struct. Not be used by gdb, but could be useful */ - set_fs(KERNEL_DS); - if (!dump_write(file, current, sizeof(*current))) - goto end_coredump; end_coredump: set_fs(fs); return has_dumped; diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 0fcd2640c23f..9eb134ea6eb2 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -1,9 +1,11 @@ config CEPH_FS tristate "Ceph distributed file system (EXPERIMENTAL)" depends on INET && EXPERIMENTAL + select CEPH_LIB select LIBCRC32C select CRYPTO_AES select CRYPTO + default n help Choose Y or M here to include support for mounting the experimental Ceph distributed file system. Ceph is an extremely @@ -14,15 +16,3 @@ config CEPH_FS If unsure, say N. -config CEPH_FS_PRETTYDEBUG - bool "Include file:line in ceph debug output" - depends on CEPH_FS - default n - help - If you say Y here, debug output will include a filename and - line to aid debugging. This icnreases kernel size and slows - execution slightly when debug call sites are enabled (e.g., - via CONFIG_DYNAMIC_DEBUG). - - If unsure, say N. - diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 278e1172600d..9e6c4f2e8ff1 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -8,15 +8,8 @@ obj-$(CONFIG_CEPH_FS) += ceph.o ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ export.o caps.o snap.o xattr.o \ - messenger.o msgpool.o buffer.o pagelist.o \ - mds_client.o mdsmap.o \ - mon_client.o \ - osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ - debugfs.o \ - auth.o auth_none.o \ - crypto.o armor.o \ - auth_x.o \ - ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o + mds_client.o mdsmap.o strings.o ceph_frag.o \ + debugfs.o else #Otherwise we were called directly from the command diff --git a/fs/ceph/README b/fs/ceph/README deleted file mode 100644 index 18352fab37c0..000000000000 --- a/fs/ceph/README +++ /dev/null @@ -1,20 +0,0 @@ -# -# The following files are shared by (and manually synchronized -# between) the Ceph userland and kernel client. -# -# userland kernel -src/include/ceph_fs.h fs/ceph/ceph_fs.h -src/include/ceph_fs.cc fs/ceph/ceph_fs.c -src/include/msgr.h fs/ceph/msgr.h -src/include/rados.h fs/ceph/rados.h -src/include/ceph_strings.cc fs/ceph/ceph_strings.c -src/include/ceph_frag.h fs/ceph/ceph_frag.h -src/include/ceph_frag.cc fs/ceph/ceph_frag.c -src/include/ceph_hash.h fs/ceph/ceph_hash.h -src/include/ceph_hash.cc fs/ceph/ceph_hash.c -src/crush/crush.c fs/ceph/crush/crush.c -src/crush/crush.h fs/ceph/crush/crush.h -src/crush/mapper.c fs/ceph/crush/mapper.c -src/crush/mapper.h fs/ceph/crush/mapper.h -src/crush/hash.h fs/ceph/crush/hash.h -src/crush/hash.c fs/ceph/crush/hash.c diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index efbc604001c8..51bcc5ce3230 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/backing-dev.h> #include <linux/fs.h> @@ -10,7 +10,8 @@ #include <linux/task_io_accounting_ops.h> #include "super.h" -#include "osd_client.h" +#include "mds_client.h" +#include <linux/ceph/osd_client.h> /* * Ceph address space ops. @@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page) { struct inode *inode = filp->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; int err = 0; u64 len = PAGE_CACHE_SIZE; @@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, { struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; int rc = 0; struct page **pages; loff_t offset; @@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) { struct inode *inode; struct ceph_inode_info *ci; - struct ceph_client *client; + struct ceph_fs_client *fsc; struct ceph_osd_client *osdc; loff_t page_off = page->index << PAGE_CACHE_SHIFT; int len = PAGE_CACHE_SIZE; @@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } inode = page->mapping->host; ci = ceph_inode(inode); - client = ceph_inode_to_client(inode); - osdc = &client->osdc; + fsc = ceph_inode_to_client(inode); + osdc = &fsc->client->osdc; /* verify this is a writeable snap context */ snapc = (void *)page->private; @@ -414,10 +417,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", inode, page, page->index, page_off, len, snapc); - writeback_stat = atomic_long_inc_return(&client->writeback_count); + writeback_stat = atomic_long_inc_return(&fsc->writeback_count); if (writeback_stat > - CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) - set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); + CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) + set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), @@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req, struct address_space *mapping = inode->i_mapping; __s32 rc = -EIO; u64 bytes = 0; - struct ceph_client *client = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); long writeback_stat; unsigned issued = ceph_caps_issued(ci); @@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req, WARN_ON(!PageUptodate(page)); writeback_stat = - atomic_long_dec_return(&client->writeback_count); + atomic_long_dec_return(&fsc->writeback_count); if (writeback_stat < - CONGESTION_OFF_THRESH(client->mount_args->congestion_kb)) - clear_bdi_congested(&client->backing_dev_info, + CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) + clear_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); ceph_put_snap_context((void *)page->private); @@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req, * mempool. we avoid the mempool if we can because req->r_num_pages * may be less than the maximum write size. */ -static void alloc_page_vec(struct ceph_client *client, +static void alloc_page_vec(struct ceph_fs_client *fsc, struct ceph_osd_request *req) { req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, GFP_NOFS); if (!req->r_pages) { - req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS); + req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); req->r_pages_from_pool = 1; WARN_ON(!req->r_pages); } @@ -590,7 +593,7 @@ static int ceph_writepages_start(struct address_space *mapping, struct inode *inode = mapping->host; struct backing_dev_info *bdi = mapping->backing_dev_info; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_client *client; + struct ceph_fs_client *fsc; pgoff_t index, start, end; int range_whole = 0; int should_loop = 1; @@ -617,13 +620,13 @@ static int ceph_writepages_start(struct address_space *mapping, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - client = ceph_inode_to_client(inode); - if (client->mount_state == CEPH_MOUNT_SHUTDOWN) { + fsc = ceph_inode_to_client(inode); + if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { pr_warning("writepage_start %p on forced umount\n", inode); return -EIO; /* we're in a forced umount, don't write! */ } - if (client->mount_args->wsize && client->mount_args->wsize < wsize) - wsize = client->mount_args->wsize; + if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) + wsize = fsc->mount_options->wsize; if (wsize < PAGE_CACHE_SIZE) wsize = PAGE_CACHE_SIZE; max_pages_ever = wsize >> PAGE_CACHE_SHIFT; @@ -769,7 +772,7 @@ get_more_pages: offset = (unsigned long long)page->index << PAGE_CACHE_SHIFT; len = wsize; - req = ceph_osdc_new_request(&client->osdc, + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), offset, &len, @@ -782,7 +785,7 @@ get_more_pages: &inode->i_mtime, true, 1); max_pages = req->r_num_pages; - alloc_page_vec(client, req); + alloc_page_vec(fsc, req); req->r_callback = writepages_finish; req->r_inode = inode; } @@ -794,10 +797,10 @@ get_more_pages: inode, page, page->index); writeback_stat = - atomic_long_inc_return(&client->writeback_count); + atomic_long_inc_return(&fsc->writeback_count); if (writeback_stat > CONGESTION_ON_THRESH( - client->mount_args->congestion_kb)) { - set_bdi_congested(&client->backing_dev_info, + fsc->mount_options->congestion_kb)) { + set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); } @@ -846,7 +849,7 @@ get_more_pages: op->payload_len = cpu_to_le32(len); req->r_request->hdr.data_len = cpu_to_le32(len); - ceph_osdc_start_request(&client->osdc, req, true); + ceph_osdc_start_request(&fsc->client->osdc, req, true); req = NULL; /* continue? */ @@ -915,7 +918,7 @@ static int ceph_update_writeable_page(struct file *file, { struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; loff_t page_off = pos & PAGE_CACHE_MASK; int pos_in_page = pos & ~PAGE_CACHE_MASK; int end_in_page = pos_in_page + len; @@ -1053,8 +1056,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file->f_dentry->d_inode; - struct ceph_client *client = ceph_inode_to_client(inode); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); int check_cap = 0; @@ -1123,7 +1126,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page = vmf->page; - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; loff_t off = page->index << PAGE_CACHE_SHIFT; loff_t size, len; int ret; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 73c153092f72..98ab13e2b71d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/fs.h> #include <linux/kernel.h> @@ -9,8 +9,9 @@ #include <linux/writeback.h> #include "super.h" -#include "decode.h" -#include "messenger.h" +#include "mds_client.h" +#include <linux/ceph/decode.h> +#include <linux/ceph/messenger.h> /* * Capability management @@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) spin_unlock(&mdsc->caps_list_lock); } -void ceph_reservation_status(struct ceph_client *client, +void ceph_reservation_status(struct ceph_fs_client *fsc, int *total, int *avail, int *used, int *reserved, int *min) { - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; if (total) *total = mdsc->caps_total_count; @@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci, static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - struct ceph_mount_args *ma = mdsc->client->mount_args; + struct ceph_mount_options *ma = mdsc->fsc->mount_options; ci->i_hold_caps_min = round_jiffies(jiffies + ma->caps_wanted_delay_min * HZ); @@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode, unsigned seq, unsigned mseq, u64 realmino, int flags, struct ceph_cap_reservation *caps_reservation) { - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *new_cap = NULL; struct ceph_cap *cap; @@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap) struct ceph_mds_session *session = cap->session; struct ceph_inode_info *ci = cap->ci; struct ceph_mds_client *mdsc = - &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; int removed = 0; dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); @@ -1210,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci, int mds; struct ceph_cap_snap *capsnap; u32 mseq; - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_session *session = NULL; /* if session != NULL, we hold session->s_mutex */ u64 next_follows = 0; /* keep track of how far we've gotten through the @@ -1336,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) { struct ceph_mds_client *mdsc = - &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct inode *inode = &ci->vfs_inode; int was = ci->i_dirty_caps; int dirty = 0; @@ -1378,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) static int __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session) { - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int flushing; @@ -1416,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode, /* * try to invalidate mapping pages without blocking. */ -static int mapping_is_empty(struct address_space *mapping) -{ - struct page *page = find_get_page(mapping, 0); - - if (!page) - return 1; - - put_page(page); - return 0; -} - static int try_nonblocking_invalidate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); @@ -1436,7 +1426,7 @@ static int try_nonblocking_invalidate(struct inode *inode) invalidate_mapping_pages(&inode->i_data, 0, -1); spin_lock(&inode->i_lock); - if (mapping_is_empty(&inode->i_data) && + if (inode->i_data.nrpages == 0 && invalidating_gen == ci->i_rdcache_gen) { /* success. */ dout("try_nonblocking_invalidate %p success\n", inode); @@ -1462,8 +1452,8 @@ static int try_nonblocking_invalidate(struct inode *inode) void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session) { - struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; int file_wanted, used; @@ -1533,7 +1523,7 @@ retry_locked: */ if ((!is_delayed || mdsc->stopping) && ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ - ci->i_rdcache_gen && /* may have cached pages */ + inode->i_data.nrpages && /* have cached pages */ (file_wanted == 0 || /* no open files */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ @@ -1706,7 +1696,7 @@ ack: static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, unsigned *flush_tid) { - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int unlock_session = session ? 0 : 1; int flushing = 0; @@ -1872,7 +1862,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) caps_are_flushed(inode, flush_tid)); } else { struct ceph_mds_client *mdsc = - &ceph_sb_to_client(inode->i_sb)->mdsc; + ceph_sb_to_client(inode->i_sb)->mdsc; spin_lock(&inode->i_lock); if (__ceph_caps_dirty(ci)) @@ -2283,7 +2273,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; - int seq = le32_to_cpu(grant->seq); + unsigned seq = le32_to_cpu(grant->seq); + unsigned issue_seq = le32_to_cpu(grant->issue_seq); int newcaps = le32_to_cpu(grant->caps); int issued, implemented, used, wanted, dirty; u64 size = le64_to_cpu(grant->size); @@ -2295,8 +2286,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, int revoked_rdcache = 0; int queue_invalidate = 0; - dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", - inode, cap, mds, seq, ceph_cap_string(newcaps)); + dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n", + inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, inode->i_size); @@ -2392,6 +2383,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } cap->seq = seq; + cap->issue_seq = issue_seq; /* file layout may have changed */ ci->i_layout = grant->layout; @@ -2463,7 +2455,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, __releases(inode->i_lock) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; @@ -2711,7 +2703,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; - struct super_block *sb = mdsc->client->sb; + struct super_block *sb = mdsc->fsc->sb; struct inode *inode; struct ceph_cap *cap; struct ceph_mds_caps *h; @@ -2774,15 +2766,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, if (op == CEPH_CAP_OP_IMPORT) __queue_cap_release(session, vino.ino, cap_id, mseq, seq); - - /* - * send any full release message to try to move things - * along for the mds (who clearly thinks we still have this - * cap). - */ - ceph_add_cap_releases(mdsc, session); - ceph_send_cap_releases(mdsc, session); - goto done; + goto flush_cap_releases; } /* these will work even if we don't have a cap yet */ @@ -2810,7 +2794,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, dout(" no cap on %p ino %llx.%llx from mds%d\n", inode, ceph_ino(inode), ceph_snap(inode), mds); spin_unlock(&inode->i_lock); - goto done; + goto flush_cap_releases; } /* note that each of these drops i_lock for us */ @@ -2834,6 +2818,17 @@ void ceph_handle_caps(struct ceph_mds_session *session, ceph_cap_op_name(op)); } + goto done; + +flush_cap_releases: + /* + * send any full release message to try to move things + * along for the mds (who clearly thinks we still have this + * cap). + */ + ceph_add_cap_releases(mdsc, session); + ceph_send_cap_releases(mdsc, session); + done: mutex_unlock(&session->s_mutex); done_unlocked: diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c index ab6cf35c4091..bdce8b1fbd06 100644 --- a/fs/ceph/ceph_frag.c +++ b/fs/ceph/ceph_frag.c @@ -1,7 +1,8 @@ /* * Ceph 'frag' type */ -#include "types.h" +#include <linux/module.h> +#include <linux/ceph/types.h> int ceph_frag_compare(__u32 a, __u32 b) { diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 6fd8b20a8611..7ae1b3d55b58 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/device.h> #include <linux/slab.h> @@ -7,143 +7,49 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> +#include <linux/ceph/libceph.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/debugfs.h> + #include "super.h" -#include "mds_client.h" -#include "mon_client.h" -#include "auth.h" #ifdef CONFIG_DEBUG_FS -/* - * Implement /sys/kernel/debug/ceph fun - * - * /sys/kernel/debug/ceph/client* - an instance of the ceph client - * .../osdmap - current osdmap - * .../mdsmap - current mdsmap - * .../monmap - current monmap - * .../osdc - active osd requests - * .../mdsc - active mds requests - * .../monc - mon client state - * .../dentry_lru - dump contents of dentry lru - * .../caps - expose cap (reservation) stats - * .../bdi - symlink to ../../bdi/something - */ - -static struct dentry *ceph_debugfs_dir; - -static int monmap_show(struct seq_file *s, void *p) -{ - int i; - struct ceph_client *client = s->private; - - if (client->monc.monmap == NULL) - return 0; - - seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); - for (i = 0; i < client->monc.monmap->num_mon; i++) { - struct ceph_entity_inst *inst = - &client->monc.monmap->mon_inst[i]; - - seq_printf(s, "\t%s%lld\t%s\n", - ENTITY_NAME(inst->name), - pr_addr(&inst->addr.in_addr)); - } - return 0; -} +#include "mds_client.h" static int mdsmap_show(struct seq_file *s, void *p) { int i; - struct ceph_client *client = s->private; + struct ceph_fs_client *fsc = s->private; - if (client->mdsc.mdsmap == NULL) + if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) return 0; - seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch); - seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root); + seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch); + seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root); seq_printf(s, "session_timeout %d\n", - client->mdsc.mdsmap->m_session_timeout); + fsc->mdsc->mdsmap->m_session_timeout); seq_printf(s, "session_autoclose %d\n", - client->mdsc.mdsmap->m_session_autoclose); - for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) { + fsc->mdsc->mdsmap->m_session_autoclose); + for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) { struct ceph_entity_addr *addr = - &client->mdsc.mdsmap->m_info[i].addr; - int state = client->mdsc.mdsmap->m_info[i].state; + &fsc->mdsc->mdsmap->m_info[i].addr; + int state = fsc->mdsc->mdsmap->m_info[i].state; - seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr), + seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, + ceph_pr_addr(&addr->in_addr), ceph_mds_state_name(state)); } return 0; } -static int osdmap_show(struct seq_file *s, void *p) -{ - int i; - struct ceph_client *client = s->private; - struct rb_node *n; - - if (client->osdc.osdmap == NULL) - return 0; - seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); - seq_printf(s, "flags%s%s\n", - (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? - " NEARFULL" : "", - (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? - " FULL" : ""); - for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { - struct ceph_pg_pool_info *pool = - rb_entry(n, struct ceph_pg_pool_info, node); - seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", - pool->id, pool->v.pg_num, pool->pg_num_mask, - pool->v.lpg_num, pool->lpg_num_mask); - } - for (i = 0; i < client->osdc.osdmap->max_osd; i++) { - struct ceph_entity_addr *addr = - &client->osdc.osdmap->osd_addr[i]; - int state = client->osdc.osdmap->osd_state[i]; - char sb[64]; - - seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", - i, pr_addr(&addr->in_addr), - ((client->osdc.osdmap->osd_weight[i]*100) >> 16), - ceph_osdmap_state_str(sb, sizeof(sb), state)); - } - return 0; -} - -static int monc_show(struct seq_file *s, void *p) -{ - struct ceph_client *client = s->private; - struct ceph_mon_generic_request *req; - struct ceph_mon_client *monc = &client->monc; - struct rb_node *rp; - - mutex_lock(&monc->mutex); - - if (monc->have_mdsmap) - seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap); - if (monc->have_osdmap) - seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap); - if (monc->want_next_osdmap) - seq_printf(s, "want next osdmap\n"); - - for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { - __u16 op; - req = rb_entry(rp, struct ceph_mon_generic_request, node); - op = le16_to_cpu(req->request->hdr.type); - if (op == CEPH_MSG_STATFS) - seq_printf(s, "%lld statfs\n", req->tid); - else - seq_printf(s, "%lld unknown\n", req->tid); - } - - mutex_unlock(&monc->mutex); - return 0; -} - +/* + * mdsc debugfs + */ static int mdsc_show(struct seq_file *s, void *p) { - struct ceph_client *client = s->private; - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct rb_node *rp; int pathlen; @@ -214,61 +120,12 @@ static int mdsc_show(struct seq_file *s, void *p) return 0; } -static int osdc_show(struct seq_file *s, void *pp) -{ - struct ceph_client *client = s->private; - struct ceph_osd_client *osdc = &client->osdc; - struct rb_node *p; - - mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { - struct ceph_osd_request *req; - struct ceph_osd_request_head *head; - struct ceph_osd_op *op; - int num_ops; - int opcode, olen; - int i; - - req = rb_entry(p, struct ceph_osd_request, r_node); - - seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1, - le32_to_cpu(req->r_pgid.pool), - le16_to_cpu(req->r_pgid.ps)); - - head = req->r_request->front.iov_base; - op = (void *)(head + 1); - - num_ops = le16_to_cpu(head->num_ops); - olen = le32_to_cpu(head->object_len); - seq_printf(s, "%.*s", olen, - (const char *)(head->ops + num_ops)); - - if (req->r_reassert_version.epoch) - seq_printf(s, "\t%u'%llu", - (unsigned)le32_to_cpu(req->r_reassert_version.epoch), - le64_to_cpu(req->r_reassert_version.version)); - else - seq_printf(s, "\t"); - - for (i = 0; i < num_ops; i++) { - opcode = le16_to_cpu(op->op); - seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); - op++; - } - - seq_printf(s, "\n"); - } - mutex_unlock(&osdc->request_mutex); - return 0; -} - static int caps_show(struct seq_file *s, void *p) { - struct ceph_client *client = s->private; + struct ceph_fs_client *fsc = s->private; int total, avail, used, reserved, min; - ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); + ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); seq_printf(s, "total\t\t%d\n" "avail\t\t%d\n" "used\t\t%d\n" @@ -280,8 +137,8 @@ static int caps_show(struct seq_file *s, void *p) static int dentry_lru_show(struct seq_file *s, void *ptr) { - struct ceph_client *client = s->private; - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_dentry_info *di; spin_lock(&mdsc->dentry_lru_lock); @@ -295,199 +152,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr) return 0; } -#define DEFINE_SHOW_FUNC(name) \ -static int name##_open(struct inode *inode, struct file *file) \ -{ \ - struct seq_file *sf; \ - int ret; \ - \ - ret = single_open(file, name, NULL); \ - sf = file->private_data; \ - sf->private = inode->i_private; \ - return ret; \ -} \ - \ -static const struct file_operations name##_fops = { \ - .open = name##_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - -DEFINE_SHOW_FUNC(monmap_show) -DEFINE_SHOW_FUNC(mdsmap_show) -DEFINE_SHOW_FUNC(osdmap_show) -DEFINE_SHOW_FUNC(monc_show) -DEFINE_SHOW_FUNC(mdsc_show) -DEFINE_SHOW_FUNC(osdc_show) -DEFINE_SHOW_FUNC(dentry_lru_show) -DEFINE_SHOW_FUNC(caps_show) +CEPH_DEFINE_SHOW_FUNC(mdsmap_show) +CEPH_DEFINE_SHOW_FUNC(mdsc_show) +CEPH_DEFINE_SHOW_FUNC(caps_show) +CEPH_DEFINE_SHOW_FUNC(dentry_lru_show) + +/* + * debugfs + */ static int congestion_kb_set(void *data, u64 val) { - struct ceph_client *client = (struct ceph_client *)data; - - if (client) - client->mount_args->congestion_kb = (int)val; + struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; + fsc->mount_options->congestion_kb = (int)val; return 0; } static int congestion_kb_get(void *data, u64 *val) { - struct ceph_client *client = (struct ceph_client *)data; - - if (client) - *val = (u64)client->mount_args->congestion_kb; + struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; + *val = (u64)fsc->mount_options->congestion_kb; return 0; } - DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, congestion_kb_set, "%llu\n"); -int __init ceph_debugfs_init(void) -{ - ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); - if (!ceph_debugfs_dir) - return -ENOMEM; - return 0; -} -void ceph_debugfs_cleanup(void) +void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) { - debugfs_remove(ceph_debugfs_dir); + dout("ceph_fs_debugfs_cleanup\n"); + debugfs_remove(fsc->debugfs_bdi); + debugfs_remove(fsc->debugfs_congestion_kb); + debugfs_remove(fsc->debugfs_mdsmap); + debugfs_remove(fsc->debugfs_caps); + debugfs_remove(fsc->debugfs_mdsc); + debugfs_remove(fsc->debugfs_dentry_lru); } -int ceph_debugfs_client_init(struct ceph_client *client) +int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { - int ret = 0; - char name[80]; - - snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, - client->monc.auth->global_id); + char name[100]; + int err = -ENOMEM; - client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); - if (!client->debugfs_dir) - goto out; - - client->monc.debugfs_file = debugfs_create_file("monc", - 0600, - client->debugfs_dir, - client, - &monc_show_fops); - if (!client->monc.debugfs_file) + dout("ceph_fs_debugfs_init\n"); + fsc->debugfs_congestion_kb = + debugfs_create_file("writeback_congestion_kb", + 0600, + fsc->client->debugfs_dir, + fsc, + &congestion_kb_fops); + if (!fsc->debugfs_congestion_kb) goto out; - client->mdsc.debugfs_file = debugfs_create_file("mdsc", - 0600, - client->debugfs_dir, - client, - &mdsc_show_fops); - if (!client->mdsc.debugfs_file) - goto out; + dout("a\n"); - client->osdc.debugfs_file = debugfs_create_file("osdc", - 0600, - client->debugfs_dir, - client, - &osdc_show_fops); - if (!client->osdc.debugfs_file) + snprintf(name, sizeof(name), "../../bdi/%s", + dev_name(fsc->backing_dev_info.dev)); + fsc->debugfs_bdi = + debugfs_create_symlink("bdi", + fsc->client->debugfs_dir, + name); + if (!fsc->debugfs_bdi) goto out; - client->debugfs_monmap = debugfs_create_file("monmap", + dout("b\n"); + fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", 0600, - client->debugfs_dir, - client, - &monmap_show_fops); - if (!client->debugfs_monmap) - goto out; - - client->debugfs_mdsmap = debugfs_create_file("mdsmap", - 0600, - client->debugfs_dir, - client, + fsc->client->debugfs_dir, + fsc, &mdsmap_show_fops); - if (!client->debugfs_mdsmap) - goto out; - - client->debugfs_osdmap = debugfs_create_file("osdmap", - 0600, - client->debugfs_dir, - client, - &osdmap_show_fops); - if (!client->debugfs_osdmap) + if (!fsc->debugfs_mdsmap) goto out; - client->debugfs_dentry_lru = debugfs_create_file("dentry_lru", - 0600, - client->debugfs_dir, - client, - &dentry_lru_show_fops); - if (!client->debugfs_dentry_lru) + dout("ca\n"); + fsc->debugfs_mdsc = debugfs_create_file("mdsc", + 0600, + fsc->client->debugfs_dir, + fsc, + &mdsc_show_fops); + if (!fsc->debugfs_mdsc) goto out; - client->debugfs_caps = debugfs_create_file("caps", + dout("da\n"); + fsc->debugfs_caps = debugfs_create_file("caps", 0400, - client->debugfs_dir, - client, + fsc->client->debugfs_dir, + fsc, &caps_show_fops); - if (!client->debugfs_caps) + if (!fsc->debugfs_caps) goto out; - client->debugfs_congestion_kb = - debugfs_create_file("writeback_congestion_kb", - 0600, - client->debugfs_dir, - client, - &congestion_kb_fops); - if (!client->debugfs_congestion_kb) + dout("ea\n"); + fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", + 0600, + fsc->client->debugfs_dir, + fsc, + &dentry_lru_show_fops); + if (!fsc->debugfs_dentry_lru) goto out; - sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev)); - client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir, - name); - return 0; out: - ceph_debugfs_client_cleanup(client); - return ret; + ceph_fs_debugfs_cleanup(fsc); + return err; } -void ceph_debugfs_client_cleanup(struct ceph_client *client) -{ - debugfs_remove(client->debugfs_bdi); - debugfs_remove(client->debugfs_caps); - debugfs_remove(client->debugfs_dentry_lru); - debugfs_remove(client->debugfs_osdmap); - debugfs_remove(client->debugfs_mdsmap); - debugfs_remove(client->debugfs_monmap); - debugfs_remove(client->osdc.debugfs_file); - debugfs_remove(client->mdsc.debugfs_file); - debugfs_remove(client->monc.debugfs_file); - debugfs_remove(client->debugfs_congestion_kb); - debugfs_remove(client->debugfs_dir); -} #else /* CONFIG_DEBUG_FS */ -int __init ceph_debugfs_init(void) -{ - return 0; -} - -void ceph_debugfs_cleanup(void) -{ -} - -int ceph_debugfs_client_init(struct ceph_client *client) +int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { return 0; } -void ceph_debugfs_client_cleanup(struct ceph_client *client) +void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) { } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index a1986eb52045..e0a2dc6fcafc 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/spinlock.h> #include <linux/fs_struct.h> @@ -7,6 +7,7 @@ #include <linux/sched.h> #include "super.h" +#include "mds_client.h" /* * Directory operations: readdir, lookup, create, link, unlink, @@ -94,10 +95,7 @@ static unsigned fpos_off(loff_t p) */ static int __dcache_readdir(struct file *filp, void *dirent, filldir_t filldir) - __releases(inode->i_lock) - __acquires(inode->i_lock) { - struct inode *inode = filp->f_dentry->d_inode; struct ceph_file_info *fi = filp->private_data; struct dentry *parent = filp->f_dentry; struct inode *dir = parent->d_inode; @@ -153,7 +151,6 @@ more: atomic_inc(&dentry->d_count); spin_unlock(&dcache_lock); - spin_unlock(&inode->i_lock); dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); @@ -171,35 +168,30 @@ more: } else { dput(last); } - last = NULL; } - - spin_lock(&inode->i_lock); - spin_lock(&dcache_lock); - last = dentry; if (err < 0) - goto out_unlock; + goto out; - p = p->prev; filp->f_pos++; /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ - if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE)) - goto more; - dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); - err = -EAGAIN; + if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { + dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); + err = -EAGAIN; + goto out; + } + + spin_lock(&dcache_lock); + p = p->prev; /* advance to next dentry */ + goto more; out_unlock: spin_unlock(&dcache_lock); - - if (last) { - spin_unlock(&inode->i_lock); +out: + if (last) dput(last); - spin_lock(&inode->i_lock); - } - return err; } @@ -227,15 +219,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) struct ceph_file_info *fi = filp->private_data; struct inode *inode = filp->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_client *client = ceph_inode_to_client(inode); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; unsigned frag = fpos_frag(filp->f_pos); int off = fpos_off(filp->f_pos); int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; - const int max_entries = client->mount_args->max_readdir; - const int max_bytes = client->mount_args->max_readdir_bytes; + const int max_entries = fsc->mount_options->max_readdir; + const int max_bytes = fsc->mount_options->max_readdir_bytes; dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); if (fi->at_end) @@ -267,17 +259,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) /* can we use the dcache? */ spin_lock(&inode->i_lock); if ((filp->f_pos == 2 || fi->dentry) && - !ceph_test_opt(client, NOASYNCREADDIR) && + !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && (ci->i_ceph_flags & CEPH_I_COMPLETE) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { + spin_unlock(&inode->i_lock); err = __dcache_readdir(filp, dirent, filldir); - if (err != -EAGAIN) { - spin_unlock(&inode->i_lock); + if (err != -EAGAIN) return err; - } + } else { + spin_unlock(&inode->i_lock); } - spin_unlock(&inode->i_lock); if (fi->dentry) { err = note_last_dentry(fi, fi->dentry->d_name.name, fi->dentry->d_name.len); @@ -487,14 +479,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err) { - struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *parent = dentry->d_parent->d_inode; /* .snap dir? */ if (err == -ENOENT && - ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */ strcmp(dentry->d_name.name, - client->mount_args->snapdir_name) == 0) { + fsc->mount_options->snapdir_name) == 0) { struct inode *inode = ceph_get_snapdir(parent); dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", dentry, dentry->d_name.len, dentry->d_name.name, inode); @@ -539,8 +530,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int op; int err; @@ -572,7 +563,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, spin_lock(&dir->i_lock); dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); if (strncmp(dentry->d_name.name, - client->mount_args->snapdir_name, + fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && (ci->i_ceph_flags & CEPH_I_COMPLETE) && @@ -629,8 +620,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) static int ceph_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; @@ -685,8 +676,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode, static int ceph_symlink(struct inode *dir, struct dentry *dentry, const char *dest) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; @@ -716,8 +707,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err = -EROFS; int op; @@ -758,8 +749,8 @@ out: static int ceph_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; @@ -813,8 +804,8 @@ static int drop_caps_for_unlink(struct inode *inode) */ static int ceph_unlink(struct inode *dir, struct dentry *dentry) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = dentry->d_inode; struct ceph_mds_request *req; int err = -EROFS; @@ -854,8 +845,8 @@ out: static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; @@ -1076,7 +1067,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, struct ceph_inode_info *ci = ceph_inode(inode); int left; - if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) + if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) return -EISDIR; if (!cf->dir_info) { @@ -1177,7 +1168,7 @@ void ceph_dentry_lru_add(struct dentry *dn) dout("dentry_lru_add %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); if (di) { - mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_add_tail(&di->lru, &mdsc->dentry_lru); mdsc->num_dentry++; @@ -1193,7 +1184,7 @@ void ceph_dentry_lru_touch(struct dentry *dn) dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, dn->d_name.len, dn->d_name.name, di->offset); if (di) { - mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_move_tail(&di->lru, &mdsc->dentry_lru); spin_unlock(&mdsc->dentry_lru_lock); @@ -1208,7 +1199,7 @@ void ceph_dentry_lru_del(struct dentry *dn) dout("dentry_lru_del %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); if (di) { - mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_del_init(&di->lru); mdsc->num_dentry--; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 4480cb1c63e7..2297d9426992 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -1,10 +1,11 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/exportfs.h> #include <linux/slab.h> #include <asm/unaligned.h> #include "super.h" +#include "mds_client.h" /* * NFS export support @@ -42,32 +43,37 @@ struct ceph_nfs_confh { static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, int connectable) { + int type; struct ceph_nfs_fh *fh = (void *)rawfh; struct ceph_nfs_confh *cfh = (void *)rawfh; struct dentry *parent = dentry->d_parent; struct inode *inode = dentry->d_inode; - int type; + int connected_handle_length = sizeof(*cfh)/4; + int handle_length = sizeof(*fh)/4; /* don't re-export snaps */ if (ceph_snap(inode) != CEPH_NOSNAP) return -EINVAL; - if (*max_len >= sizeof(*cfh)) { + if (*max_len >= connected_handle_length) { dout("encode_fh %p connectable\n", dentry); cfh->ino = ceph_ino(dentry->d_inode); cfh->parent_ino = ceph_ino(parent->d_inode); cfh->parent_name_hash = parent->d_name.hash; - *max_len = sizeof(*cfh); + *max_len = connected_handle_length; type = 2; - } else if (*max_len > sizeof(*fh)) { - if (connectable) - return -ENOSPC; + } else if (*max_len >= handle_length) { + if (connectable) { + *max_len = connected_handle_length; + return 255; + } dout("encode_fh %p\n", dentry); fh->ino = ceph_ino(dentry->d_inode); - *max_len = sizeof(*fh); + *max_len = handle_length; type = 1; } else { - return -ENOSPC; + *max_len = handle_length; + return 255; } return type; } @@ -115,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, static struct dentry *__cfh_to_dentry(struct super_block *sb, struct ceph_nfs_confh *cfh) { - struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct inode *inode; struct dentry *dentry; struct ceph_vino vino; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8c044a4f0457..e77c28cf3690 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1,5 +1,6 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> +#include <linux/module.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/file.h> @@ -38,8 +39,8 @@ static struct ceph_mds_request * prepare_open_request(struct super_block *sb, int flags, int create_mode) { - struct ceph_client *client = ceph_sb_to_client(sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int want_auth = USE_ANY_MDS; int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; @@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) int ceph_open(struct inode *inode, struct file *file) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_client *client = ceph_sb_to_client(inode->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct ceph_file_info *cf = file->private_data; struct inode *parent_inode = file->f_dentry->d_parent->d_inode; @@ -216,8 +217,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd, int mode, int locked_dir) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct file *file = nd->intent.open.file; struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); struct ceph_mds_request *req; @@ -270,163 +271,6 @@ int ceph_release(struct inode *inode, struct file *file) } /* - * build a vector of user pages - */ -static struct page **get_direct_page_vector(const char __user *data, - int num_pages, - loff_t off, size_t len) -{ - struct page **pages; - int rc; - - pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); - if (!pages) - return ERR_PTR(-ENOMEM); - - down_read(¤t->mm->mmap_sem); - rc = get_user_pages(current, current->mm, (unsigned long)data, - num_pages, 0, 0, pages, NULL); - up_read(¤t->mm->mmap_sem); - if (rc < 0) - goto fail; - return pages; - -fail: - kfree(pages); - return ERR_PTR(rc); -} - -static void put_page_vector(struct page **pages, int num_pages) -{ - int i; - - for (i = 0; i < num_pages; i++) - put_page(pages[i]); - kfree(pages); -} - -void ceph_release_page_vector(struct page **pages, int num_pages) -{ - int i; - - for (i = 0; i < num_pages; i++) - __free_pages(pages[i], 0); - kfree(pages); -} - -/* - * allocate a vector new pages - */ -static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) -{ - struct page **pages; - int i; - - pages = kmalloc(sizeof(*pages) * num_pages, flags); - if (!pages) - return ERR_PTR(-ENOMEM); - for (i = 0; i < num_pages; i++) { - pages[i] = __page_cache_alloc(flags); - if (pages[i] == NULL) { - ceph_release_page_vector(pages, i); - return ERR_PTR(-ENOMEM); - } - } - return pages; -} - -/* - * copy user data into a page vector - */ -static int copy_user_to_page_vector(struct page **pages, - const char __user *data, - loff_t off, size_t len) -{ - int i = 0; - int po = off & ~PAGE_CACHE_MASK; - int left = len; - int l, bad; - - while (left > 0) { - l = min_t(int, PAGE_CACHE_SIZE-po, left); - bad = copy_from_user(page_address(pages[i]) + po, data, l); - if (bad == l) - return -EFAULT; - data += l - bad; - left -= l - bad; - po += l - bad; - if (po == PAGE_CACHE_SIZE) { - po = 0; - i++; - } - } - return len; -} - -/* - * copy user data from a page vector into a user pointer - */ -static int copy_page_vector_to_user(struct page **pages, char __user *data, - loff_t off, size_t len) -{ - int i = 0; - int po = off & ~PAGE_CACHE_MASK; - int left = len; - int l, bad; - - while (left > 0) { - l = min_t(int, left, PAGE_CACHE_SIZE-po); - bad = copy_to_user(data, page_address(pages[i]) + po, l); - if (bad == l) - return -EFAULT; - data += l - bad; - left -= l - bad; - if (po) { - po += l - bad; - if (po == PAGE_CACHE_SIZE) - po = 0; - } - i++; - } - return len; -} - -/* - * Zero an extent within a page vector. Offset is relative to the - * start of the first page. - */ -static void zero_page_vector_range(int off, int len, struct page **pages) -{ - int i = off >> PAGE_CACHE_SHIFT; - - off &= ~PAGE_CACHE_MASK; - - dout("zero_page_vector_page %u~%u\n", off, len); - - /* leading partial page? */ - if (off) { - int end = min((int)PAGE_CACHE_SIZE, off + len); - dout("zeroing %d %p head from %d\n", i, pages[i], - (int)off); - zero_user_segment(pages[i], off, end); - len -= (end - off); - i++; - } - while (len >= PAGE_CACHE_SIZE) { - dout("zeroing %d %p len=%d\n", i, pages[i], len); - zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); - len -= PAGE_CACHE_SIZE; - i++; - } - /* trailing partial page? */ - if (len) { - dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len); - zero_user_segment(pages[i], 0, len); - } -} - - -/* * Read a range of bytes striped over one or more objects. Iterate over * objects we stripe over. (That's not atomic, but good enough for now.) * @@ -438,7 +282,7 @@ static int striped_read(struct inode *inode, struct page **pages, int num_pages, int *checkeof) { - struct ceph_client *client = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len; int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ @@ -459,7 +303,7 @@ static int striped_read(struct inode *inode, more: this_len = left; - ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode), + ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, ci->i_truncate_seq, ci->i_truncate_size, @@ -477,8 +321,8 @@ more: if (read < pos - off) { dout(" zero gap %llu to %llu\n", off + read, pos); - zero_page_vector_range(page_off + read, - pos - off - read, pages); + ceph_zero_page_vector_range(page_off + read, + pos - off - read, pages); } pos += ret; read = pos - off; @@ -495,8 +339,8 @@ more: /* was original extent fully inside i_size? */ if (pos + left <= inode->i_size) { dout("zero tail\n"); - zero_page_vector_range(page_off + read, len - read, - pages); + ceph_zero_page_vector_range(page_off + read, len - read, + pages); read = len; goto out; } @@ -531,7 +375,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); if (file->f_flags & O_DIRECT) { - pages = get_direct_page_vector(data, num_pages, off, len); + pages = ceph_get_direct_page_vector(data, num_pages, off, len); /* * flush any page cache pages in this range. this @@ -552,13 +396,13 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, ret = striped_read(inode, off, len, pages, num_pages, checkeof); if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) - ret = copy_page_vector_to_user(pages, data, off, ret); + ret = ceph_copy_page_vector_to_user(pages, data, off, ret); if (ret >= 0) *poff = off + ret; done: if (file->f_flags & O_DIRECT) - put_page_vector(pages, num_pages); + ceph_put_page_vector(pages, num_pages); else ceph_release_page_vector(pages, num_pages); dout("sync_read result %d\n", ret); @@ -594,7 +438,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, { struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_client *client = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req; struct page **pages; int num_pages; @@ -642,7 +486,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, */ more: len = left; - req = ceph_osdc_new_request(&client->osdc, &ci->i_layout, + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), pos, &len, CEPH_OSD_OP_WRITE, flags, ci->i_snap_realm->cached_context, @@ -655,7 +499,7 @@ more: num_pages = calc_pages_for(pos, len); if (file->f_flags & O_DIRECT) { - pages = get_direct_page_vector(data, num_pages, pos, len); + pages = ceph_get_direct_page_vector(data, num_pages, pos, len); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; @@ -673,7 +517,7 @@ more: ret = PTR_ERR(pages); goto out; } - ret = copy_user_to_page_vector(pages, data, pos, len); + ret = ceph_copy_user_to_page_vector(pages, data, pos, len); if (ret < 0) { ceph_release_page_vector(pages, num_pages); goto out; @@ -689,7 +533,7 @@ more: req->r_num_pages = num_pages; req->r_inode = inode; - ret = ceph_osdc_start_request(&client->osdc, req, false); + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) { if (req->r_safe_callback) { /* @@ -697,15 +541,15 @@ more: * start_request so that a tid has been assigned. */ spin_lock(&ci->i_unsafe_lock); - list_add(&ci->i_unsafe_writes, &req->r_unsafe_item); + list_add(&req->r_unsafe_item, &ci->i_unsafe_writes); spin_unlock(&ci->i_unsafe_lock); ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); } - ret = ceph_osdc_wait_request(&client->osdc, req); + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); } if (file->f_flags & O_DIRECT) - put_page_vector(pages, num_pages); + ceph_put_page_vector(pages, num_pages); else if (file->f_flags & O_SYNC) ceph_release_page_vector(pages, num_pages); @@ -814,7 +658,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; + struct ceph_osd_client *osdc = + &ceph_sb_to_client(inode->i_sb)->client->osdc; loff_t endoff = pos + iov->iov_len; int want, got = 0; int ret, err; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 62377ec37edf..1d6a45b5a04c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/module.h> #include <linux/fs.h> @@ -13,7 +13,8 @@ #include <linux/pagevec.h> #include "super.h" -#include "decode.h" +#include "mds_client.h" +#include <linux/ceph/decode.h> /* * Ceph inode operations @@ -384,7 +385,7 @@ void ceph_destroy_inode(struct inode *inode) */ if (ci->i_snap_realm) { struct ceph_mds_client *mdsc = - &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct ceph_snap_realm *realm = ci->i_snap_realm; dout(" dropping residual ref to snap realm %p\n", realm); @@ -685,7 +686,7 @@ static int fill_inode(struct inode *inode, } /* it may be better to set st_size in getattr instead? */ - if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) + if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) inode->i_size = ci->i_rbytes; break; default: @@ -901,7 +902,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, struct inode *in = NULL; struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; - struct ceph_client *client = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); int i = 0; int err = 0; @@ -965,7 +966,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, */ if (rinfo->head->is_dentry && !req->r_aborted && (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, - client->mount_args->snapdir_name, + fsc->mount_options->snapdir_name, req->r_dentry->d_name.len))) { /* * lookup link rename : null -> possibly existing inode @@ -1533,7 +1534,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) struct inode *parent_inode = dentry->d_parent->d_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; - struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; int issued; int release = 0, dirtied = 0; int mask = 0; @@ -1728,8 +1729,8 @@ out: */ int ceph_do_getattr(struct inode *inode, int mask) { - struct ceph_client *client = ceph_sb_to_client(inode->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 76e307d2aba1..8888c9ba68db 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -1,8 +1,10 @@ #include <linux/in.h> -#include "ioctl.h" #include "super.h" -#include "ceph_debug.h" +#include "mds_client.h" +#include <linux/ceph/ceph_debug.h> + +#include "ioctl.h" /* @@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { struct inode *inode = file->f_dentry->d_inode; struct inode *parent_inode = file->f_dentry->d_parent->d_inode; - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; int err, i; @@ -90,6 +92,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) } /* + * Set a layout policy on a directory inode. All items in the tree + * rooted at this inode will inherit this layout on creation, + * (It doesn't apply retroactively ) + * unless a subdirectory has its own layout policy. + */ +static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_mds_request *req; + struct ceph_ioctl_layout l; + int err, i; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + + /* copy and validate */ + if (copy_from_user(&l, arg, sizeof(l))) + return -EFAULT; + + if ((l.object_size & ~PAGE_MASK) || + (l.stripe_unit & ~PAGE_MASK) || + !l.stripe_unit || + (l.object_size && + (unsigned)l.object_size % (unsigned)l.stripe_unit)) + return -EINVAL; + + /* make sure it's a valid data pool */ + if (l.data_pool > 0) { + mutex_lock(&mdsc->mutex); + err = -EINVAL; + for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) + if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) { + err = 0; + break; + } + mutex_unlock(&mdsc->mutex); + if (err) + return err; + } + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT, + USE_AUTH_MDS); + + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = igrab(inode); + + req->r_args.setlayout.layout.fl_stripe_unit = + cpu_to_le32(l.stripe_unit); + req->r_args.setlayout.layout.fl_stripe_count = + cpu_to_le32(l.stripe_count); + req->r_args.setlayout.layout.fl_object_size = + cpu_to_le32(l.object_size); + req->r_args.setlayout.layout.fl_pg_pool = + cpu_to_le32(l.data_pool); + req->r_args.setlayout.layout.fl_pg_preferred = + cpu_to_le32(l.preferred_osd); + + err = ceph_mdsc_do_request(mdsc, inode, req); + ceph_mdsc_put_request(req); + return err; +} + +/* * Return object name, size/offset information, and location (OSD * number, network address) for a given file offset. */ @@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct ceph_ioctl_dataloc dl; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; + struct ceph_osd_client *osdc = + &ceph_sb_to_client(inode->i_sb)->client->osdc; u64 len = 1, olen; u64 tmp; struct ceph_object_layout ol; @@ -174,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case CEPH_IOC_SET_LAYOUT: return ceph_ioctl_set_layout(file, (void __user *)arg); + case CEPH_IOC_SET_LAYOUT_POLICY: + return ceph_ioctl_set_layout_policy(file, (void __user *)arg); + case CEPH_IOC_GET_DATALOC: return ceph_ioctl_get_dataloc(file, (void __user *)arg); case CEPH_IOC_LAZYIO: return ceph_ioctl_lazyio(file); } + return -ENOTTY; } diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index 88451a3b6857..a6ce54e94eb5 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -4,7 +4,7 @@ #include <linux/ioctl.h> #include <linux/types.h> -#define CEPH_IOCTL_MAGIC 0x97 +#define CEPH_IOCTL_MAGIC 0x98 /* just use u64 to align sanely on all archs */ struct ceph_ioctl_layout { @@ -17,6 +17,8 @@ struct ceph_ioctl_layout { struct ceph_ioctl_layout) #define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ struct ceph_ioctl_layout) +#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \ + struct ceph_ioctl_layout) /* * Extract identity, address of the OSD and object storing a given diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ff4e753aae92..40abde93c345 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -1,11 +1,11 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/file.h> #include <linux/namei.h> #include "super.h" #include "mds_client.h" -#include "pagelist.h" +#include <linux/ceph/pagelist.h> /** * Implement fcntl and flock locking functions. @@ -16,7 +16,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, { struct inode *inode = file->f_dentry->d_inode; struct ceph_mds_client *mdsc = - &ceph_sb_to_client(inode->i_sb)->mdsc; + ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; int err; @@ -181,8 +181,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) * Encode the flock and fcntl locks for the given inode into the pagelist. * Format is: #fcntl locks, sequential fcntl locks, #flock locks, * sequential flock locks. - * Must be called with BLK already held, and the lock numbers should have - * been gathered under the same lock holding window. + * Must be called with lock_flocks() already held. + * If we encounter more of a specific lock type than expected, + * we return the value 1. */ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, int num_fcntl_locks, int num_flock_locks) @@ -190,6 +191,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, struct file_lock *lock; struct ceph_filelock cephlock; int err = 0; + int seen_fcntl = 0; + int seen_flock = 0; dout("encoding %d flock and %d fcntl locks", num_flock_locks, num_fcntl_locks); @@ -198,6 +201,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, goto fail; for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { if (lock->fl_flags & FL_POSIX) { + ++seen_fcntl; + if (seen_fcntl > num_fcntl_locks) { + err = -ENOSPC; + goto fail; + } err = lock_to_ceph_filelock(lock, &cephlock); if (err) goto fail; @@ -213,6 +221,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, goto fail; for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { if (lock->fl_flags & FL_FLOCK) { + ++seen_flock; + if (seen_flock > num_flock_locks) { + err = -ENOSPC; + goto fail; + } err = lock_to_ceph_filelock(lock, &cephlock); if (err) goto fail; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index fad95f8f2608..3142b15940c2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1,17 +1,21 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> +#include <linux/fs.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> #include <linux/smp_lock.h> -#include "mds_client.h" -#include "mon_client.h" #include "super.h" -#include "messenger.h" -#include "decode.h" -#include "auth.h" -#include "pagelist.h" +#include "mds_client.h" + +#include <linux/ceph/messenger.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/pagelist.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/debugfs.h> /* * A cluster of MDS (metadata server) daemons is responsible for @@ -286,8 +290,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s) atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); if (atomic_dec_and_test(&s->s_ref)) { if (s->s_authorizer) - s->s_mdsc->client->monc.auth->ops->destroy_authorizer( - s->s_mdsc->client->monc.auth, s->s_authorizer); + s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer( + s->s_mdsc->fsc->client->monc.auth, + s->s_authorizer); kfree(s); } } @@ -344,7 +349,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_seq = 0; mutex_init(&s->s_mutex); - ceph_con_init(mdsc->client->msgr, &s->s_con); + ceph_con_init(mdsc->fsc->client->msgr, &s->s_con); s->s_con.private = s; s->s_con.ops = &mds_con_ops; s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; @@ -599,7 +604,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else if (req->r_dentry) { struct inode *dir = req->r_dentry->d_parent->d_inode; - if (dir->i_sb != mdsc->client->sb) { + if (dir->i_sb != mdsc->fsc->sb) { /* not this fs! */ inode = req->r_dentry->d_inode; } else if (ceph_snap(dir) != CEPH_NOSNAP) { @@ -884,7 +889,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, __ceph_remove_cap(cap); if (!__ceph_is_any_real_caps(ci)) { struct ceph_mds_client *mdsc = - &ceph_sb_to_client(inode->i_sb)->mdsc; + ceph_sb_to_client(inode->i_sb)->mdsc; spin_lock(&mdsc->cap_dirty_lock); if (!list_empty(&ci->i_dirty_item)) { @@ -1146,7 +1151,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, struct ceph_msg *msg, *partial = NULL; struct ceph_mds_cap_release *head; int err = -ENOMEM; - int extra = mdsc->client->mount_args->cap_release_safety; + int extra = mdsc->fsc->mount_options->cap_release_safety; int num; dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, @@ -2085,7 +2090,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* insert trace into our cache */ mutex_lock(&req->r_fill_mutex); - err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); + err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); if (err == 0) { if (result == 0 && rinfo->dir_nr) ceph_readdir_prepopulate(req, req->r_session); @@ -2361,19 +2366,35 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (recon_state->flock) { int num_fcntl_locks, num_flock_locks; - - lock_kernel(); - ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); - rec.v2.flock_len = (2*sizeof(u32) + - (num_fcntl_locks+num_flock_locks) * - sizeof(struct ceph_filelock)); - - err = ceph_pagelist_append(pagelist, &rec, reclen); - if (!err) - err = ceph_encode_locks(inode, pagelist, - num_fcntl_locks, - num_flock_locks); - unlock_kernel(); + struct ceph_pagelist_cursor trunc_point; + + ceph_pagelist_set_cursor(pagelist, &trunc_point); + do { + lock_flocks(); + ceph_count_locks(inode, &num_fcntl_locks, + &num_flock_locks); + rec.v2.flock_len = (2*sizeof(u32) + + (num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock)); + unlock_flocks(); + + /* pre-alloc pagelist */ + ceph_pagelist_truncate(pagelist, &trunc_point); + err = ceph_pagelist_append(pagelist, &rec, reclen); + if (!err) + err = ceph_pagelist_reserve(pagelist, + rec.v2.flock_len); + + /* encode locks */ + if (!err) { + lock_flocks(); + err = ceph_encode_locks(inode, + pagelist, + num_fcntl_locks, + num_flock_locks); + unlock_flocks(); + } + } while (err == -ENOSPC); } else { err = ceph_pagelist_append(pagelist, &rec, reclen); } @@ -2613,7 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { - struct super_block *sb = mdsc->client->sb; + struct super_block *sb = mdsc->fsc->sb; struct inode *inode; struct ceph_inode_info *ci; struct dentry *parent, *dentry; @@ -2891,10 +2912,16 @@ static void delayed_work(struct work_struct *work) schedule_delayed(mdsc); } +int ceph_mdsc_init(struct ceph_fs_client *fsc) -int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) { - mdsc->client = client; + struct ceph_mds_client *mdsc; + + mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); + if (!mdsc) + return -ENOMEM; + mdsc->fsc = fsc; + fsc->mdsc = mdsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); if (mdsc->mdsmap == NULL) @@ -2927,7 +2954,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) INIT_LIST_HEAD(&mdsc->dentry_lru); ceph_caps_init(mdsc); - ceph_adjust_min_caps(mdsc, client->min_caps); + ceph_adjust_min_caps(mdsc, fsc->min_caps); return 0; } @@ -2939,7 +2966,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) static void wait_requests(struct ceph_mds_client *mdsc) { struct ceph_mds_request *req; - struct ceph_client *client = mdsc->client; + struct ceph_fs_client *fsc = mdsc->fsc; mutex_lock(&mdsc->mutex); if (__get_oldest_req(mdsc)) { @@ -2947,7 +2974,7 @@ static void wait_requests(struct ceph_mds_client *mdsc) dout("wait_requests waiting for requests\n"); wait_for_completion_timeout(&mdsc->safe_umount_waiters, - client->mount_args->mount_timeout * HZ); + fsc->client->options->mount_timeout * HZ); /* tear down remaining requests */ mutex_lock(&mdsc->mutex); @@ -3030,7 +3057,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush; - if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) + if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) return; dout("sync\n"); @@ -3053,7 +3080,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc) { int i, n = 0; - if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) + if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) return true; mutex_lock(&mdsc->mutex); @@ -3071,8 +3098,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { struct ceph_mds_session *session; int i; - struct ceph_client *client = mdsc->client; - unsigned long timeout = client->mount_args->mount_timeout * HZ; + struct ceph_fs_client *fsc = mdsc->fsc; + unsigned long timeout = fsc->client->options->mount_timeout * HZ; dout("close_sessions\n"); @@ -3119,7 +3146,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) dout("stopped\n"); } -void ceph_mdsc_stop(struct ceph_mds_client *mdsc) +static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) { dout("stop\n"); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ @@ -3129,6 +3156,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc) ceph_caps_finalize(mdsc); } +void ceph_mdsc_destroy(struct ceph_fs_client *fsc) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + + ceph_mdsc_stop(mdsc); + fsc->mdsc = NULL; + kfree(mdsc); +} + /* * handle mds map update. @@ -3145,14 +3181,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); - if (ceph_check_fsid(mdsc->client, &fsid) < 0) + if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0) return; epoch = ceph_decode_32(&p); maplen = ceph_decode_32(&p); dout("handle_map epoch %u len %d\n", epoch, (int)maplen); /* do we need it? */ - ceph_monc_got_mdsmap(&mdsc->client->monc, epoch); + ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch); mutex_lock(&mdsc->mutex); if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { dout("handle_map epoch %u <= our %u\n", @@ -3176,7 +3212,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) } else { mdsc->mdsmap = newmap; /* first mds map */ } - mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; + mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; __wake_requests(mdsc, &mdsc->waiting_for_map); @@ -3277,7 +3313,7 @@ static int get_authorizer(struct ceph_connection *con, { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - struct ceph_auth_client *ac = mdsc->client->monc.auth; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; int ret = 0; if (force_new && s->s_authorizer) { @@ -3311,7 +3347,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - struct ceph_auth_client *ac = mdsc->client->monc.auth; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); } @@ -3320,12 +3356,12 @@ static int invalidate_authorizer(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - struct ceph_auth_client *ac = mdsc->client->monc.auth; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; if (ac->ops->invalidate_authorizer) ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); - return ceph_monc_validate_auth(&mdsc->client->monc); + return ceph_monc_validate_auth(&mdsc->fsc->client->monc); } static const struct ceph_connection_operations mds_con_ops = { @@ -3338,7 +3374,4 @@ static const struct ceph_connection_operations mds_con_ops = { .peer_reset = peer_reset, }; - - - /* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index c98267ce6d2a..d66d63c72355 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -8,9 +8,9 @@ #include <linux/rbtree.h> #include <linux/spinlock.h> -#include "types.h" -#include "messenger.h" -#include "mdsmap.h" +#include <linux/ceph/types.h> +#include <linux/ceph/messenger.h> +#include <linux/ceph/mdsmap.h> /* * Some lock dependencies: @@ -26,7 +26,7 @@ * */ -struct ceph_client; +struct ceph_fs_client; struct ceph_cap; /* @@ -230,7 +230,7 @@ struct ceph_mds_request { * mds client state */ struct ceph_mds_client { - struct ceph_client *client; + struct ceph_fs_client *fsc; struct mutex mutex; /* all nested structures */ struct ceph_mdsmap *mdsmap; @@ -289,11 +289,6 @@ struct ceph_mds_client { int caps_avail_count; /* unused, unreserved */ int caps_min_count; /* keep at least this many (unreserved) */ - -#ifdef CONFIG_DEBUG_FS - struct dentry *debugfs_file; -#endif - spinlock_t dentry_lru_lock; struct list_head dentry_lru; int num_dentry; @@ -316,10 +311,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s); extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, struct ceph_msg *msg, int mds); -extern int ceph_mdsc_init(struct ceph_mds_client *mdsc, - struct ceph_client *client); +extern int ceph_mdsc_init(struct ceph_fs_client *fsc); extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); -extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc); +extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 040be6d1150b..73b7d44e8a35 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/bug.h> #include <linux/err.h> @@ -6,9 +6,9 @@ #include <linux/slab.h> #include <linux/types.h> -#include "mdsmap.h" -#include "messenger.h" -#include "decode.h" +#include <linux/ceph/mdsmap.h> +#include <linux/ceph/messenger.h> +#include <linux/ceph/decode.h> #include "super.h" @@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) } dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", - i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr), + i+1, n, global_id, mds, inc, + ceph_pr_addr(&addr.in_addr), ceph_mds_state_name(state)); if (mds >= 0 && mds < m->m_max_mds && state > 0) { m->m_info[mds].global_id = global_id; diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c deleted file mode 100644 index 46a368b6dce5..000000000000 --- a/fs/ceph/pagelist.c +++ /dev/null @@ -1,63 +0,0 @@ - -#include <linux/gfp.h> -#include <linux/pagemap.h> -#include <linux/highmem.h> - -#include "pagelist.h" - -static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) -{ - struct page *page = list_entry(pl->head.prev, struct page, - lru); - kunmap(page); -} - -int ceph_pagelist_release(struct ceph_pagelist *pl) -{ - if (pl->mapped_tail) - ceph_pagelist_unmap_tail(pl); - - while (!list_empty(&pl->head)) { - struct page *page = list_first_entry(&pl->head, struct page, - lru); - list_del(&page->lru); - __free_page(page); - } - return 0; -} - -static int ceph_pagelist_addpage(struct ceph_pagelist *pl) -{ - struct page *page = __page_cache_alloc(GFP_NOFS); - if (!page) - return -ENOMEM; - pl->room += PAGE_SIZE; - list_add_tail(&page->lru, &pl->head); - if (pl->mapped_tail) - ceph_pagelist_unmap_tail(pl); - pl->mapped_tail = kmap(page); - return 0; -} - -int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len) -{ - while (pl->room < len) { - size_t bit = pl->room; - int ret; - - memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), - buf, bit); - pl->length += bit; - pl->room -= bit; - buf += bit; - len -= bit; - ret = ceph_pagelist_addpage(pl); - if (ret) - return ret; - } - - memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len); - pl->length += len; - pl->room -= len; - return 0; -} diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 190b6c4a6f2b..39c243acd062 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -1,10 +1,12 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/sort.h> #include <linux/slab.h> #include "super.h" -#include "decode.h" +#include "mds_client.h" + +#include <linux/ceph/decode.h> /* * Snapshots in ceph are driven in large part by cooperation from the @@ -526,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { struct inode *inode = &ci->vfs_inode; - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; BUG_ON(capsnap->writing); capsnap->size = inode->i_size; @@ -747,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { - struct super_block *sb = mdsc->client->sb; + struct super_block *sb = mdsc->fsc->sb; int mds = session->s_mds; u64 split; int op; diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c index c6179d3a26a2..cd5097d7c804 100644 --- a/fs/ceph/ceph_strings.c +++ b/fs/ceph/strings.c @@ -1,71 +1,9 @@ /* - * Ceph string constants + * Ceph fs string constants */ -#include "types.h" +#include <linux/module.h> +#include <linux/ceph/types.h> -const char *ceph_entity_type_name(int type) -{ - switch (type) { - case CEPH_ENTITY_TYPE_MDS: return "mds"; - case CEPH_ENTITY_TYPE_OSD: return "osd"; - case CEPH_ENTITY_TYPE_MON: return "mon"; - case CEPH_ENTITY_TYPE_CLIENT: return "client"; - case CEPH_ENTITY_TYPE_AUTH: return "auth"; - default: return "unknown"; - } -} - -const char *ceph_osd_op_name(int op) -{ - switch (op) { - case CEPH_OSD_OP_READ: return "read"; - case CEPH_OSD_OP_STAT: return "stat"; - - case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; - - case CEPH_OSD_OP_WRITE: return "write"; - case CEPH_OSD_OP_DELETE: return "delete"; - case CEPH_OSD_OP_TRUNCATE: return "truncate"; - case CEPH_OSD_OP_ZERO: return "zero"; - case CEPH_OSD_OP_WRITEFULL: return "writefull"; - case CEPH_OSD_OP_ROLLBACK: return "rollback"; - - case CEPH_OSD_OP_APPEND: return "append"; - case CEPH_OSD_OP_STARTSYNC: return "startsync"; - case CEPH_OSD_OP_SETTRUNC: return "settrunc"; - case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc"; - - case CEPH_OSD_OP_TMAPUP: return "tmapup"; - case CEPH_OSD_OP_TMAPGET: return "tmapget"; - case CEPH_OSD_OP_TMAPPUT: return "tmapput"; - - case CEPH_OSD_OP_GETXATTR: return "getxattr"; - case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; - case CEPH_OSD_OP_SETXATTR: return "setxattr"; - case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; - case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; - case CEPH_OSD_OP_RMXATTR: return "rmxattr"; - case CEPH_OSD_OP_CMPXATTR: return "cmpxattr"; - - case CEPH_OSD_OP_PULL: return "pull"; - case CEPH_OSD_OP_PUSH: return "push"; - case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; - case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; - case CEPH_OSD_OP_SCRUB: return "scrub"; - - case CEPH_OSD_OP_WRLOCK: return "wrlock"; - case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; - case CEPH_OSD_OP_RDLOCK: return "rdlock"; - case CEPH_OSD_OP_RDUNLOCK: return "rdunlock"; - case CEPH_OSD_OP_UPLOCK: return "uplock"; - case CEPH_OSD_OP_DNLOCK: return "dnlock"; - - case CEPH_OSD_OP_CALL: return "call"; - - case CEPH_OSD_OP_PGLS: return "pgls"; - } - return "???"; -} const char *ceph_mds_state_name(int s) { @@ -177,17 +115,3 @@ const char *ceph_snap_op_name(int o) } return "???"; } - -const char *ceph_pool_op_name(int op) -{ - switch (op) { - case POOL_OP_CREATE: return "create"; - case POOL_OP_DELETE: return "delete"; - case POOL_OP_AUID_CHANGE: return "auid change"; - case POOL_OP_CREATE_SNAP: return "create snap"; - case POOL_OP_DELETE_SNAP: return "delete snap"; - case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; - case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap"; - } - return "???"; -} diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9922628532b2..d6e0e0421891 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1,5 +1,5 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/backing-dev.h> #include <linux/ctype.h> @@ -15,10 +15,13 @@ #include <linux/statfs.h> #include <linux/string.h> -#include "decode.h" #include "super.h" -#include "mon_client.h" -#include "auth.h" +#include "mds_client.h" + +#include <linux/ceph/decode.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/debugfs.h> /* * Ceph superblock operations @@ -26,36 +29,22 @@ * Handle the basics of mounting, unmounting. */ - -/* - * find filename portion of a path (/foo/bar/baz -> baz) - */ -const char *ceph_file_part(const char *s, int len) -{ - const char *e = s + len; - - while (e != s && *(e-1) != '/') - e--; - return e; -} - - /* * super ops */ static void ceph_put_super(struct super_block *s) { - struct ceph_client *client = ceph_sb_to_client(s); + struct ceph_fs_client *fsc = ceph_sb_to_client(s); dout("put_super\n"); - ceph_mdsc_close_sessions(&client->mdsc); + ceph_mdsc_close_sessions(fsc->mdsc); /* * ensure we release the bdi before put_anon_super releases * the device name. */ - if (s->s_bdi == &client->backing_dev_info) { - bdi_unregister(&client->backing_dev_info); + if (s->s_bdi == &fsc->backing_dev_info) { + bdi_unregister(&fsc->backing_dev_info); s->s_bdi = NULL; } @@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s) static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct ceph_client *client = ceph_inode_to_client(dentry->d_inode); - struct ceph_monmap *monmap = client->monc.monmap; + struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode); + struct ceph_monmap *monmap = fsc->client->monc.monmap; struct ceph_statfs st; u64 fsid; int err; dout("statfs\n"); - err = ceph_monc_do_statfs(&client->monc, &st); + err = ceph_monc_do_statfs(&fsc->client->monc, &st); if (err < 0) return err; @@ -104,238 +93,28 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) static int ceph_sync_fs(struct super_block *sb, int wait) { - struct ceph_client *client = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); if (!wait) { dout("sync_fs (non-blocking)\n"); - ceph_flush_dirty_caps(&client->mdsc); + ceph_flush_dirty_caps(fsc->mdsc); dout("sync_fs (non-blocking) done\n"); return 0; } dout("sync_fs (blocking)\n"); - ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); - ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); + ceph_osdc_sync(&fsc->client->osdc); + ceph_mdsc_sync(fsc->mdsc); dout("sync_fs (blocking) done\n"); return 0; } -static int default_congestion_kb(void) -{ - int congestion_kb; - - /* - * Copied from NFS - * - * congestion size, scale with available memory. - * - * 64MB: 8192k - * 128MB: 11585k - * 256MB: 16384k - * 512MB: 23170k - * 1GB: 32768k - * 2GB: 46340k - * 4GB: 65536k - * 8GB: 92681k - * 16GB: 131072k - * - * This allows larger machines to have larger/more transfers. - * Limit the default to 256M - */ - congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); - if (congestion_kb > 256*1024) - congestion_kb = 256*1024; - - return congestion_kb; -} - -/** - * ceph_show_options - Show mount options in /proc/mounts - * @m: seq_file to write to - * @mnt: mount descriptor - */ -static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) -{ - struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb); - struct ceph_mount_args *args = client->mount_args; - - if (args->flags & CEPH_OPT_FSID) - seq_printf(m, ",fsid=%pU", &args->fsid); - if (args->flags & CEPH_OPT_NOSHARE) - seq_puts(m, ",noshare"); - if (args->flags & CEPH_OPT_DIRSTAT) - seq_puts(m, ",dirstat"); - if ((args->flags & CEPH_OPT_RBYTES) == 0) - seq_puts(m, ",norbytes"); - if (args->flags & CEPH_OPT_NOCRC) - seq_puts(m, ",nocrc"); - if (args->flags & CEPH_OPT_NOASYNCREADDIR) - seq_puts(m, ",noasyncreaddir"); - - if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) - seq_printf(m, ",mount_timeout=%d", args->mount_timeout); - if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) - seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl); - if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) - seq_printf(m, ",osdtimeout=%d", args->osd_timeout); - if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) - seq_printf(m, ",osdkeepalivetimeout=%d", - args->osd_keepalive_timeout); - if (args->wsize) - seq_printf(m, ",wsize=%d", args->wsize); - if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT) - seq_printf(m, ",rsize=%d", args->rsize); - if (args->congestion_kb != default_congestion_kb()) - seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb); - if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) - seq_printf(m, ",caps_wanted_delay_min=%d", - args->caps_wanted_delay_min); - if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) - seq_printf(m, ",caps_wanted_delay_max=%d", - args->caps_wanted_delay_max); - if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) - seq_printf(m, ",cap_release_safety=%d", - args->cap_release_safety); - if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT) - seq_printf(m, ",readdir_max_entries=%d", args->max_readdir); - if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) - seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes); - if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) - seq_printf(m, ",snapdirname=%s", args->snapdir_name); - if (args->name) - seq_printf(m, ",name=%s", args->name); - if (args->secret) - seq_puts(m, ",secret=<hidden>"); - return 0; -} - -/* - * caches - */ -struct kmem_cache *ceph_inode_cachep; -struct kmem_cache *ceph_cap_cachep; -struct kmem_cache *ceph_dentry_cachep; -struct kmem_cache *ceph_file_cachep; - -static void ceph_inode_init_once(void *foo) -{ - struct ceph_inode_info *ci = foo; - inode_init_once(&ci->vfs_inode); -} - -static int __init init_caches(void) -{ - ceph_inode_cachep = kmem_cache_create("ceph_inode_info", - sizeof(struct ceph_inode_info), - __alignof__(struct ceph_inode_info), - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), - ceph_inode_init_once); - if (ceph_inode_cachep == NULL) - return -ENOMEM; - - ceph_cap_cachep = KMEM_CACHE(ceph_cap, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_cap_cachep == NULL) - goto bad_cap; - - ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_dentry_cachep == NULL) - goto bad_dentry; - - ceph_file_cachep = KMEM_CACHE(ceph_file_info, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_file_cachep == NULL) - goto bad_file; - - return 0; - -bad_file: - kmem_cache_destroy(ceph_dentry_cachep); -bad_dentry: - kmem_cache_destroy(ceph_cap_cachep); -bad_cap: - kmem_cache_destroy(ceph_inode_cachep); - return -ENOMEM; -} - -static void destroy_caches(void) -{ - kmem_cache_destroy(ceph_inode_cachep); - kmem_cache_destroy(ceph_cap_cachep); - kmem_cache_destroy(ceph_dentry_cachep); - kmem_cache_destroy(ceph_file_cachep); -} - - -/* - * ceph_umount_begin - initiate forced umount. Tear down down the - * mount, skipping steps that may hang while waiting for server(s). - */ -static void ceph_umount_begin(struct super_block *sb) -{ - struct ceph_client *client = ceph_sb_to_client(sb); - - dout("ceph_umount_begin - starting forced umount\n"); - if (!client) - return; - client->mount_state = CEPH_MOUNT_SHUTDOWN; - return; -} - -static const struct super_operations ceph_super_ops = { - .alloc_inode = ceph_alloc_inode, - .destroy_inode = ceph_destroy_inode, - .write_inode = ceph_write_inode, - .sync_fs = ceph_sync_fs, - .put_super = ceph_put_super, - .show_options = ceph_show_options, - .statfs = ceph_statfs, - .umount_begin = ceph_umount_begin, -}; - - -const char *ceph_msg_type_name(int type) -{ - switch (type) { - case CEPH_MSG_SHUTDOWN: return "shutdown"; - case CEPH_MSG_PING: return "ping"; - case CEPH_MSG_AUTH: return "auth"; - case CEPH_MSG_AUTH_REPLY: return "auth_reply"; - case CEPH_MSG_MON_MAP: return "mon_map"; - case CEPH_MSG_MON_GET_MAP: return "mon_get_map"; - case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe"; - case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; - case CEPH_MSG_STATFS: return "statfs"; - case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; - case CEPH_MSG_MDS_MAP: return "mds_map"; - case CEPH_MSG_CLIENT_SESSION: return "client_session"; - case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; - case CEPH_MSG_CLIENT_REQUEST: return "client_request"; - case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward"; - case CEPH_MSG_CLIENT_REPLY: return "client_reply"; - case CEPH_MSG_CLIENT_CAPS: return "client_caps"; - case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; - case CEPH_MSG_CLIENT_SNAP: return "client_snap"; - case CEPH_MSG_CLIENT_LEASE: return "client_lease"; - case CEPH_MSG_OSD_MAP: return "osd_map"; - case CEPH_MSG_OSD_OP: return "osd_op"; - case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; - default: return "unknown"; - } -} - - /* * mount options */ enum { Opt_wsize, Opt_rsize, - Opt_osdtimeout, - Opt_osdkeepalivetimeout, - Opt_mount_timeout, - Opt_osd_idle_ttl, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, Opt_cap_release_safety, @@ -344,29 +123,19 @@ enum { Opt_congestion_kb, Opt_last_int, /* int args above */ - Opt_fsid, Opt_snapdirname, - Opt_name, - Opt_secret, Opt_last_string, /* string args above */ - Opt_ip, - Opt_noshare, Opt_dirstat, Opt_nodirstat, Opt_rbytes, Opt_norbytes, - Opt_nocrc, Opt_noasyncreaddir, }; -static match_table_t arg_tokens = { +static match_table_t fsopt_tokens = { {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, - {Opt_osdtimeout, "osdtimeout=%d"}, - {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, - {Opt_mount_timeout, "mount_timeout=%d"}, - {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, {Opt_cap_release_safety, "cap_release_safety=%d"}, @@ -374,403 +143,459 @@ static match_table_t arg_tokens = { {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, {Opt_congestion_kb, "write_congestion_kb=%d"}, /* int args above */ - {Opt_fsid, "fsid=%s"}, {Opt_snapdirname, "snapdirname=%s"}, - {Opt_name, "name=%s"}, - {Opt_secret, "secret=%s"}, /* string args above */ - {Opt_ip, "ip=%s"}, - {Opt_noshare, "noshare"}, {Opt_dirstat, "dirstat"}, {Opt_nodirstat, "nodirstat"}, {Opt_rbytes, "rbytes"}, {Opt_norbytes, "norbytes"}, - {Opt_nocrc, "nocrc"}, {Opt_noasyncreaddir, "noasyncreaddir"}, {-1, NULL} }; -static int parse_fsid(const char *str, struct ceph_fsid *fsid) +static int parse_fsopt_token(char *c, void *private) { - int i = 0; - char tmp[3]; - int err = -EINVAL; - int d; - - dout("parse_fsid '%s'\n", str); - tmp[2] = 0; - while (*str && i < 16) { - if (ispunct(*str)) { - str++; - continue; + struct ceph_mount_options *fsopt = private; + substring_t argstr[MAX_OPT_ARGS]; + int token, intval, ret; + + token = match_token((char *)c, fsopt_tokens, argstr); + if (token < 0) + return -EINVAL; + + if (token < Opt_last_int) { + ret = match_int(&argstr[0], &intval); + if (ret < 0) { + pr_err("bad mount option arg (not int) " + "at '%s'\n", c); + return ret; } - if (!isxdigit(str[0]) || !isxdigit(str[1])) - break; - tmp[0] = str[0]; - tmp[1] = str[1]; - if (sscanf(tmp, "%x", &d) < 1) - break; - fsid->fsid[i] = d & 0xff; - i++; - str += 2; + dout("got int token %d val %d\n", token, intval); + } else if (token > Opt_last_int && token < Opt_last_string) { + dout("got string token %d val %s\n", token, + argstr[0].from); + } else { + dout("got token %d\n", token); } - if (i == 16) - err = 0; - dout("parse_fsid ret %d got fsid %pU", err, fsid); - return err; + switch (token) { + case Opt_snapdirname: + kfree(fsopt->snapdir_name); + fsopt->snapdir_name = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + if (!fsopt->snapdir_name) + return -ENOMEM; + break; + + /* misc */ + case Opt_wsize: + fsopt->wsize = intval; + break; + case Opt_rsize: + fsopt->rsize = intval; + break; + case Opt_caps_wanted_delay_min: + fsopt->caps_wanted_delay_min = intval; + break; + case Opt_caps_wanted_delay_max: + fsopt->caps_wanted_delay_max = intval; + break; + case Opt_readdir_max_entries: + fsopt->max_readdir = intval; + break; + case Opt_readdir_max_bytes: + fsopt->max_readdir_bytes = intval; + break; + case Opt_congestion_kb: + fsopt->congestion_kb = intval; + break; + case Opt_dirstat: + fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; + break; + case Opt_nodirstat: + fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; + break; + case Opt_rbytes: + fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; + break; + case Opt_norbytes: + fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; + break; + case Opt_noasyncreaddir: + fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; + break; + default: + BUG_ON(token); + } + return 0; } -static struct ceph_mount_args *parse_mount_args(int flags, char *options, - const char *dev_name, - const char **path) +static void destroy_mount_options(struct ceph_mount_options *args) { - struct ceph_mount_args *args; - const char *c; - int err = -ENOMEM; - substring_t argstr[MAX_OPT_ARGS]; + dout("destroy_mount_options %p\n", args); + kfree(args->snapdir_name); + kfree(args); +} - args = kzalloc(sizeof(*args), GFP_KERNEL); - if (!args) - return ERR_PTR(-ENOMEM); - args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr), - GFP_KERNEL); - if (!args->mon_addr) - goto out; +static int strcmp_null(const char *s1, const char *s2) +{ + if (!s1 && !s2) + return 0; + if (s1 && !s2) + return -1; + if (!s1 && s2) + return 1; + return strcmp(s1, s2); +} - dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name); - - /* start with defaults */ - args->sb_flags = flags; - args->flags = CEPH_OPT_DEFAULT; - args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; - args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; - args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ - args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ - args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; - args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; - args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); - args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; - args->max_readdir = CEPH_MAX_READDIR_DEFAULT; - args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; - args->congestion_kb = default_congestion_kb(); - - /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ - err = -EINVAL; - if (!dev_name) - goto out; - *path = strstr(dev_name, ":/"); - if (*path == NULL) { - pr_err("device name is missing path (no :/ in %s)\n", - dev_name); - goto out; - } +static int compare_mount_options(struct ceph_mount_options *new_fsopt, + struct ceph_options *new_opt, + struct ceph_fs_client *fsc) +{ + struct ceph_mount_options *fsopt1 = new_fsopt; + struct ceph_mount_options *fsopt2 = fsc->mount_options; + int ofs = offsetof(struct ceph_mount_options, snapdir_name); + int ret; - /* get mon ip(s) */ - err = ceph_parse_ips(dev_name, *path, args->mon_addr, - CEPH_MAX_MON, &args->num_mon); - if (err < 0) - goto out; + ret = memcmp(fsopt1, fsopt2, ofs); + if (ret) + return ret; + + ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name); + if (ret) + return ret; + + return ceph_compare_options(new_opt, fsc->client); +} + +static int parse_mount_options(struct ceph_mount_options **pfsopt, + struct ceph_options **popt, + int flags, char *options, + const char *dev_name, + const char **path) +{ + struct ceph_mount_options *fsopt; + const char *dev_name_end; + int err = -ENOMEM; + + fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); + if (!fsopt) + return -ENOMEM; + + dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); + + fsopt->sb_flags = flags; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + + fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + + /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ + err = -EINVAL; + if (!dev_name) + goto out; + *path = strstr(dev_name, ":/"); + if (*path == NULL) { + pr_err("device name is missing path (no :/ in %s)\n", + dev_name); + goto out; + } + dev_name_end = *path; + dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); /* path on server */ *path += 2; dout("server path '%s'\n", *path); - /* parse mount options */ - while ((c = strsep(&options, ",")) != NULL) { - int token, intval, ret; - if (!*c) - continue; - err = -EINVAL; - token = match_token((char *)c, arg_tokens, argstr); - if (token < 0) { - pr_err("bad mount option at '%s'\n", c); - goto out; - } - if (token < Opt_last_int) { - ret = match_int(&argstr[0], &intval); - if (ret < 0) { - pr_err("bad mount option arg (not int) " - "at '%s'\n", c); - continue; - } - dout("got int token %d val %d\n", token, intval); - } else if (token > Opt_last_int && token < Opt_last_string) { - dout("got string token %d val %s\n", token, - argstr[0].from); - } else { - dout("got token %d\n", token); - } - switch (token) { - case Opt_ip: - err = ceph_parse_ips(argstr[0].from, - argstr[0].to, - &args->my_addr, - 1, NULL); - if (err < 0) - goto out; - args->flags |= CEPH_OPT_MYIP; - break; - - case Opt_fsid: - err = parse_fsid(argstr[0].from, &args->fsid); - if (err == 0) - args->flags |= CEPH_OPT_FSID; - break; - case Opt_snapdirname: - kfree(args->snapdir_name); - args->snapdir_name = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - break; - case Opt_name: - args->name = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - break; - case Opt_secret: - args->secret = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - break; - - /* misc */ - case Opt_wsize: - args->wsize = intval; - break; - case Opt_rsize: - args->rsize = intval; - break; - case Opt_osdtimeout: - args->osd_timeout = intval; - break; - case Opt_osdkeepalivetimeout: - args->osd_keepalive_timeout = intval; - break; - case Opt_osd_idle_ttl: - args->osd_idle_ttl = intval; - break; - case Opt_mount_timeout: - args->mount_timeout = intval; - break; - case Opt_caps_wanted_delay_min: - args->caps_wanted_delay_min = intval; - break; - case Opt_caps_wanted_delay_max: - args->caps_wanted_delay_max = intval; - break; - case Opt_readdir_max_entries: - args->max_readdir = intval; - break; - case Opt_readdir_max_bytes: - args->max_readdir_bytes = intval; - break; - case Opt_congestion_kb: - args->congestion_kb = intval; - break; - - case Opt_noshare: - args->flags |= CEPH_OPT_NOSHARE; - break; - - case Opt_dirstat: - args->flags |= CEPH_OPT_DIRSTAT; - break; - case Opt_nodirstat: - args->flags &= ~CEPH_OPT_DIRSTAT; - break; - case Opt_rbytes: - args->flags |= CEPH_OPT_RBYTES; - break; - case Opt_norbytes: - args->flags &= ~CEPH_OPT_RBYTES; - break; - case Opt_nocrc: - args->flags |= CEPH_OPT_NOCRC; - break; - case Opt_noasyncreaddir: - args->flags |= CEPH_OPT_NOASYNCREADDIR; - break; - - default: - BUG_ON(token); - } - } - return args; + err = ceph_parse_options(popt, options, dev_name, dev_name_end, + parse_fsopt_token, (void *)fsopt); + if (err) + goto out; + + /* success */ + *pfsopt = fsopt; + return 0; out: - kfree(args->mon_addr); - kfree(args); - return ERR_PTR(err); + destroy_mount_options(fsopt); + return err; } -static void destroy_mount_args(struct ceph_mount_args *args) +/** + * ceph_show_options - Show mount options in /proc/mounts + * @m: seq_file to write to + * @mnt: mount descriptor + */ +static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) { - dout("destroy_mount_args %p\n", args); - kfree(args->snapdir_name); - args->snapdir_name = NULL; - kfree(args->name); - args->name = NULL; - kfree(args->secret); - args->secret = NULL; - kfree(args); + struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb); + struct ceph_mount_options *fsopt = fsc->mount_options; + struct ceph_options *opt = fsc->client->options; + + if (opt->flags & CEPH_OPT_FSID) + seq_printf(m, ",fsid=%pU", &opt->fsid); + if (opt->flags & CEPH_OPT_NOSHARE) + seq_puts(m, ",noshare"); + if (opt->flags & CEPH_OPT_NOCRC) + seq_puts(m, ",nocrc"); + + if (opt->name) + seq_printf(m, ",name=%s", opt->name); + if (opt->secret) + seq_puts(m, ",secret=<hidden>"); + + if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) + seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); + if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) + seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); + if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) + seq_printf(m, ",osdtimeout=%d", opt->osd_timeout); + if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) + seq_printf(m, ",osdkeepalivetimeout=%d", + opt->osd_keepalive_timeout); + + if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) + seq_puts(m, ",dirstat"); + if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0) + seq_puts(m, ",norbytes"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) + seq_puts(m, ",noasyncreaddir"); + + if (fsopt->wsize) + seq_printf(m, ",wsize=%d", fsopt->wsize); + if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT) + seq_printf(m, ",rsize=%d", fsopt->rsize); + if (fsopt->congestion_kb != default_congestion_kb()) + seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); + if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) + seq_printf(m, ",caps_wanted_delay_min=%d", + fsopt->caps_wanted_delay_min); + if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) + seq_printf(m, ",caps_wanted_delay_max=%d", + fsopt->caps_wanted_delay_max); + if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) + seq_printf(m, ",cap_release_safety=%d", + fsopt->cap_release_safety); + if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) + seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); + if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) + seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); + if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) + seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + return 0; } /* - * create a fresh client instance + * handle any mon messages the standard library doesn't understand. + * return error if we don't either. */ -static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) +static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) { - struct ceph_client *client; + struct ceph_fs_client *fsc = client->private; + int type = le16_to_cpu(msg->hdr.type); + + switch (type) { + case CEPH_MSG_MDS_MAP: + ceph_mdsc_handle_map(fsc->mdsc, msg); + return 0; + + default: + return -1; + } +} + +/* + * create a new fs client + */ +struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, + struct ceph_options *opt) +{ + struct ceph_fs_client *fsc; int err = -ENOMEM; - client = kzalloc(sizeof(*client), GFP_KERNEL); - if (client == NULL) + fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); + if (!fsc) return ERR_PTR(-ENOMEM); - mutex_init(&client->mount_mutex); - - init_waitqueue_head(&client->auth_wq); + fsc->client = ceph_create_client(opt, fsc); + if (IS_ERR(fsc->client)) { + err = PTR_ERR(fsc->client); + goto fail; + } + fsc->client->extra_mon_dispatch = extra_mon_dispatch; + fsc->client->supported_features |= CEPH_FEATURE_FLOCK; + fsc->client->monc.want_mdsmap = 1; - client->sb = NULL; - client->mount_state = CEPH_MOUNT_MOUNTING; - client->mount_args = args; + fsc->mount_options = fsopt; - client->msgr = NULL; + fsc->sb = NULL; + fsc->mount_state = CEPH_MOUNT_MOUNTING; - client->auth_err = 0; - atomic_long_set(&client->writeback_count, 0); + atomic_long_set(&fsc->writeback_count, 0); - err = bdi_init(&client->backing_dev_info); + err = bdi_init(&fsc->backing_dev_info); if (err < 0) - goto fail; + goto fail_client; err = -ENOMEM; - client->wb_wq = create_workqueue("ceph-writeback"); - if (client->wb_wq == NULL) + fsc->wb_wq = create_workqueue("ceph-writeback"); + if (fsc->wb_wq == NULL) goto fail_bdi; - client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); - if (client->pg_inv_wq == NULL) + fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); + if (fsc->pg_inv_wq == NULL) goto fail_wb_wq; - client->trunc_wq = create_singlethread_workqueue("ceph-trunc"); - if (client->trunc_wq == NULL) + fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc"); + if (fsc->trunc_wq == NULL) goto fail_pg_inv_wq; /* set up mempools */ err = -ENOMEM; - client->wb_pagevec_pool = mempool_create_kmalloc_pool(10, - client->mount_args->wsize >> PAGE_CACHE_SHIFT); - if (!client->wb_pagevec_pool) + fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, + fsc->mount_options->wsize >> PAGE_CACHE_SHIFT); + if (!fsc->wb_pagevec_pool) goto fail_trunc_wq; /* caps */ - client->min_caps = args->max_readdir; + fsc->min_caps = fsopt->max_readdir; + + return fsc; - /* subsystems */ - err = ceph_monc_init(&client->monc, client); - if (err < 0) - goto fail_mempool; - err = ceph_osdc_init(&client->osdc, client); - if (err < 0) - goto fail_monc; - err = ceph_mdsc_init(&client->mdsc, client); - if (err < 0) - goto fail_osdc; - return client; - -fail_osdc: - ceph_osdc_stop(&client->osdc); -fail_monc: - ceph_monc_stop(&client->monc); -fail_mempool: - mempool_destroy(client->wb_pagevec_pool); fail_trunc_wq: - destroy_workqueue(client->trunc_wq); + destroy_workqueue(fsc->trunc_wq); fail_pg_inv_wq: - destroy_workqueue(client->pg_inv_wq); + destroy_workqueue(fsc->pg_inv_wq); fail_wb_wq: - destroy_workqueue(client->wb_wq); + destroy_workqueue(fsc->wb_wq); fail_bdi: - bdi_destroy(&client->backing_dev_info); + bdi_destroy(&fsc->backing_dev_info); +fail_client: + ceph_destroy_client(fsc->client); fail: - kfree(client); + kfree(fsc); return ERR_PTR(err); } -static void ceph_destroy_client(struct ceph_client *client) +void destroy_fs_client(struct ceph_fs_client *fsc) { - dout("destroy_client %p\n", client); + dout("destroy_fs_client %p\n", fsc); - /* unmount */ - ceph_mdsc_stop(&client->mdsc); - ceph_osdc_stop(&client->osdc); + destroy_workqueue(fsc->wb_wq); + destroy_workqueue(fsc->pg_inv_wq); + destroy_workqueue(fsc->trunc_wq); - /* - * make sure mds and osd connections close out before destroying - * the auth module, which is needed to free those connections' - * ceph_authorizers. - */ - ceph_msgr_flush(); - - ceph_monc_stop(&client->monc); + bdi_destroy(&fsc->backing_dev_info); - ceph_debugfs_client_cleanup(client); - destroy_workqueue(client->wb_wq); - destroy_workqueue(client->pg_inv_wq); - destroy_workqueue(client->trunc_wq); + mempool_destroy(fsc->wb_pagevec_pool); - bdi_destroy(&client->backing_dev_info); + destroy_mount_options(fsc->mount_options); - if (client->msgr) - ceph_messenger_destroy(client->msgr); - mempool_destroy(client->wb_pagevec_pool); + ceph_fs_debugfs_cleanup(fsc); - destroy_mount_args(client->mount_args); + ceph_destroy_client(fsc->client); - kfree(client); - dout("destroy_client %p done\n", client); + kfree(fsc); + dout("destroy_fs_client %p done\n", fsc); } /* - * Initially learn our fsid, or verify an fsid matches. + * caches */ -int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) +struct kmem_cache *ceph_inode_cachep; +struct kmem_cache *ceph_cap_cachep; +struct kmem_cache *ceph_dentry_cachep; +struct kmem_cache *ceph_file_cachep; + +static void ceph_inode_init_once(void *foo) { - if (client->have_fsid) { - if (ceph_fsid_compare(&client->fsid, fsid)) { - pr_err("bad fsid, had %pU got %pU", - &client->fsid, fsid); - return -1; - } - } else { - pr_info("client%lld fsid %pU\n", client->monc.auth->global_id, - fsid); - memcpy(&client->fsid, fsid, sizeof(*fsid)); - ceph_debugfs_client_init(client); - client->have_fsid = true; - } + struct ceph_inode_info *ci = foo; + inode_init_once(&ci->vfs_inode); +} + +static int __init init_caches(void) +{ + ceph_inode_cachep = kmem_cache_create("ceph_inode_info", + sizeof(struct ceph_inode_info), + __alignof__(struct ceph_inode_info), + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + ceph_inode_init_once); + if (ceph_inode_cachep == NULL) + return -ENOMEM; + + ceph_cap_cachep = KMEM_CACHE(ceph_cap, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_cap_cachep == NULL) + goto bad_cap; + + ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_dentry_cachep == NULL) + goto bad_dentry; + + ceph_file_cachep = KMEM_CACHE(ceph_file_info, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_file_cachep == NULL) + goto bad_file; + return 0; + +bad_file: + kmem_cache_destroy(ceph_dentry_cachep); +bad_dentry: + kmem_cache_destroy(ceph_cap_cachep); +bad_cap: + kmem_cache_destroy(ceph_inode_cachep); + return -ENOMEM; } +static void destroy_caches(void) +{ + kmem_cache_destroy(ceph_inode_cachep); + kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_dentry_cachep); + kmem_cache_destroy(ceph_file_cachep); +} + + /* - * true if we have the mon map (and have thus joined the cluster) + * ceph_umount_begin - initiate forced umount. Tear down down the + * mount, skipping steps that may hang while waiting for server(s). */ -static int have_mon_and_osd_map(struct ceph_client *client) +static void ceph_umount_begin(struct super_block *sb) { - return client->monc.monmap && client->monc.monmap->epoch && - client->osdc.osdmap && client->osdc.osdmap->epoch; + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + + dout("ceph_umount_begin - starting forced umount\n"); + if (!fsc) + return; + fsc->mount_state = CEPH_MOUNT_SHUTDOWN; + return; } +static const struct super_operations ceph_super_ops = { + .alloc_inode = ceph_alloc_inode, + .destroy_inode = ceph_destroy_inode, + .write_inode = ceph_write_inode, + .sync_fs = ceph_sync_fs, + .put_super = ceph_put_super, + .show_options = ceph_show_options, + .statfs = ceph_statfs, + .umount_begin = ceph_umount_begin, +}; + /* * Bootstrap mount by opening the root directory. Note the mount * @started time from caller, and time out if this takes too long. */ -static struct dentry *open_root_dentry(struct ceph_client *client, +static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, const char *path, unsigned long started) { - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req = NULL; int err; struct dentry *root; @@ -784,14 +609,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client, req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.snap = CEPH_NOSNAP; req->r_started = started; - req->r_timeout = client->mount_args->mount_timeout * HZ; + req->r_timeout = fsc->client->options->mount_timeout * HZ; req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err == 0) { dout("open_root_inode success\n"); if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && - client->sb->s_root == NULL) + fsc->sb->s_root == NULL) root = d_alloc_root(req->r_target_inode); else root = d_obtain_alias(req->r_target_inode); @@ -804,105 +629,86 @@ static struct dentry *open_root_dentry(struct ceph_client *client, return root; } + + + /* * mount: join the ceph cluster, and open root directory. */ -static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, +static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt, const char *path) { - struct ceph_entity_addr *myaddr = NULL; int err; - unsigned long timeout = client->mount_args->mount_timeout * HZ; unsigned long started = jiffies; /* note the start time */ struct dentry *root; + int first = 0; /* first vfsmount for this super_block */ dout("mount start\n"); - mutex_lock(&client->mount_mutex); - - /* initialize the messenger */ - if (client->msgr == NULL) { - if (ceph_test_opt(client, MYIP)) - myaddr = &client->mount_args->my_addr; - client->msgr = ceph_messenger_create(myaddr); - if (IS_ERR(client->msgr)) { - err = PTR_ERR(client->msgr); - client->msgr = NULL; - goto out; - } - client->msgr->nocrc = ceph_test_opt(client, NOCRC); - } + mutex_lock(&fsc->client->mount_mutex); - /* open session, and wait for mon, mds, and osd maps */ - err = ceph_monc_open_session(&client->monc); + err = __ceph_open_session(fsc->client, started); if (err < 0) goto out; - while (!have_mon_and_osd_map(client)) { - err = -EIO; - if (timeout && time_after_eq(jiffies, started + timeout)) - goto out; - - /* wait */ - dout("mount waiting for mon_map\n"); - err = wait_event_interruptible_timeout(client->auth_wq, - have_mon_and_osd_map(client) || (client->auth_err < 0), - timeout); - if (err == -EINTR || err == -ERESTARTSYS) - goto out; - if (client->auth_err < 0) { - err = client->auth_err; - goto out; - } - } - dout("mount opening root\n"); - root = open_root_dentry(client, "", started); + root = open_root_dentry(fsc, "", started); if (IS_ERR(root)) { err = PTR_ERR(root); goto out; } - if (client->sb->s_root) + if (fsc->sb->s_root) { dput(root); - else - client->sb->s_root = root; + } else { + fsc->sb->s_root = root; + first = 1; + + err = ceph_fs_debugfs_init(fsc); + if (err < 0) + goto fail; + } if (path[0] == 0) { dget(root); } else { dout("mount opening base mountpoint\n"); - root = open_root_dentry(client, path, started); + root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); - dput(client->sb->s_root); - client->sb->s_root = NULL; - goto out; + goto fail; } } mnt->mnt_root = root; - mnt->mnt_sb = client->sb; + mnt->mnt_sb = fsc->sb; - client->mount_state = CEPH_MOUNT_MOUNTED; + fsc->mount_state = CEPH_MOUNT_MOUNTED; dout("mount success\n"); err = 0; out: - mutex_unlock(&client->mount_mutex); + mutex_unlock(&fsc->client->mount_mutex); return err; + +fail: + if (first) { + dput(fsc->sb->s_root); + fsc->sb->s_root = NULL; + } + goto out; } static int ceph_set_super(struct super_block *s, void *data) { - struct ceph_client *client = data; + struct ceph_fs_client *fsc = data; int ret; dout("set_super %p data %p\n", s, data); - s->s_flags = client->mount_args->sb_flags; + s->s_flags = fsc->mount_options->sb_flags; s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ - s->s_fs_info = client; - client->sb = s; + s->s_fs_info = fsc; + fsc->sb = s; s->s_op = &ceph_super_ops; s->s_export_op = &ceph_export_ops; @@ -917,7 +723,7 @@ static int ceph_set_super(struct super_block *s, void *data) fail: s->s_fs_info = NULL; - client->sb = NULL; + fsc->sb = NULL; return ret; } @@ -926,30 +732,23 @@ fail: */ static int ceph_compare_super(struct super_block *sb, void *data) { - struct ceph_client *new = data; - struct ceph_mount_args *args = new->mount_args; - struct ceph_client *other = ceph_sb_to_client(sb); - int i; + struct ceph_fs_client *new = data; + struct ceph_mount_options *fsopt = new->mount_options; + struct ceph_options *opt = new->client->options; + struct ceph_fs_client *other = ceph_sb_to_client(sb); dout("ceph_compare_super %p\n", sb); - if (args->flags & CEPH_OPT_FSID) { - if (ceph_fsid_compare(&args->fsid, &other->fsid)) { - dout("fsid doesn't match\n"); - return 0; - } - } else { - /* do we share (a) monitor? */ - for (i = 0; i < new->monc.monmap->num_mon; i++) - if (ceph_monmap_contains(other->monc.monmap, - &new->monc.monmap->mon_inst[i].addr)) - break; - if (i == new->monc.monmap->num_mon) { - dout("mon ip not part of monmap\n"); - return 0; - } - dout("mon ip matches existing sb %p\n", sb); + + if (compare_mount_options(fsopt, opt, other)) { + dout("monitor(s)/mount options don't match\n"); + return 0; } - if (args->sb_flags != other->mount_args->sb_flags) { + if ((opt->flags & CEPH_OPT_FSID) && + ceph_fsid_compare(&opt->fsid, &other->client->fsid)) { + dout("fsid doesn't match\n"); + return 0; + } + if (fsopt->sb_flags != other->mount_options->sb_flags) { dout("flags differ\n"); return 0; } @@ -961,19 +760,20 @@ static int ceph_compare_super(struct super_block *sb, void *data) */ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) +static int ceph_register_bdi(struct super_block *sb, + struct ceph_fs_client *fsc) { int err; /* set ra_pages based on rsize mount option? */ - if (client->mount_args->rsize >= PAGE_CACHE_SIZE) - client->backing_dev_info.ra_pages = - (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) + if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + fsc->backing_dev_info.ra_pages = + (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; - err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d", + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", atomic_long_inc_return(&bdi_seq)); if (!err) - sb->s_bdi = &client->backing_dev_info; + sb->s_bdi = &fsc->backing_dev_info; return err; } @@ -982,46 +782,52 @@ static int ceph_get_sb(struct file_system_type *fs_type, struct vfsmount *mnt) { struct super_block *sb; - struct ceph_client *client; + struct ceph_fs_client *fsc; int err; int (*compare_super)(struct super_block *, void *) = ceph_compare_super; const char *path = NULL; - struct ceph_mount_args *args; + struct ceph_mount_options *fsopt = NULL; + struct ceph_options *opt = NULL; dout("ceph_get_sb\n"); - args = parse_mount_args(flags, data, dev_name, &path); - if (IS_ERR(args)) { - err = PTR_ERR(args); + err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); + if (err < 0) goto out_final; - } /* create client (which we may/may not use) */ - client = ceph_create_client(args); - if (IS_ERR(client)) { - err = PTR_ERR(client); + fsc = create_fs_client(fsopt, opt); + if (IS_ERR(fsc)) { + err = PTR_ERR(fsc); + kfree(fsopt); + kfree(opt); goto out_final; } - if (client->mount_args->flags & CEPH_OPT_NOSHARE) + err = ceph_mdsc_init(fsc); + if (err < 0) + goto out; + + if (ceph_test_opt(fsc->client, NOSHARE)) compare_super = NULL; - sb = sget(fs_type, compare_super, ceph_set_super, client); + sb = sget(fs_type, compare_super, ceph_set_super, fsc); if (IS_ERR(sb)) { err = PTR_ERR(sb); goto out; } - if (ceph_sb_to_client(sb) != client) { - ceph_destroy_client(client); - client = ceph_sb_to_client(sb); - dout("get_sb got existing client %p\n", client); + if (ceph_sb_to_client(sb) != fsc) { + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); + fsc = ceph_sb_to_client(sb); + dout("get_sb got existing client %p\n", fsc); } else { - dout("get_sb using new client %p\n", client); - err = ceph_register_bdi(sb, client); + dout("get_sb using new client %p\n", fsc); + err = ceph_register_bdi(sb, fsc); if (err < 0) goto out_splat; } - err = ceph_mount(client, mnt, path); + err = ceph_mount(fsc, mnt, path); if (err < 0) goto out_splat; dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, @@ -1029,12 +835,13 @@ static int ceph_get_sb(struct file_system_type *fs_type, return 0; out_splat: - ceph_mdsc_close_sessions(&client->mdsc); + ceph_mdsc_close_sessions(fsc->mdsc); deactivate_locked_super(sb); goto out_final; out: - ceph_destroy_client(client); + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); out_final: dout("ceph_get_sb fail %d\n", err); return err; @@ -1042,11 +849,12 @@ out_final: static void ceph_kill_sb(struct super_block *s) { - struct ceph_client *client = ceph_sb_to_client(s); + struct ceph_fs_client *fsc = ceph_sb_to_client(s); dout("kill_sb %p\n", s); - ceph_mdsc_pre_umount(&client->mdsc); + ceph_mdsc_pre_umount(fsc->mdsc); kill_anon_super(s); /* will call put_super after sb is r/o */ - ceph_destroy_client(client); + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); } static struct file_system_type ceph_fs_type = { @@ -1062,36 +870,20 @@ static struct file_system_type ceph_fs_type = { static int __init init_ceph(void) { - int ret = 0; - - ret = ceph_debugfs_init(); - if (ret < 0) - goto out; - - ret = ceph_msgr_init(); - if (ret < 0) - goto out_debugfs; - - ret = init_caches(); + int ret = init_caches(); if (ret) - goto out_msgr; + goto out; ret = register_filesystem(&ceph_fs_type); if (ret) goto out_icache; - pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n", - CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL, - CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, - CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); + pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); + return 0; out_icache: destroy_caches(); -out_msgr: - ceph_msgr_exit(); -out_debugfs: - ceph_debugfs_cleanup(); out: return ret; } @@ -1101,8 +893,6 @@ static void __exit exit_ceph(void) dout("exit_ceph\n"); unregister_filesystem(&ceph_fs_type); destroy_caches(); - ceph_msgr_exit(); - ceph_debugfs_cleanup(); } module_init(init_ceph); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b87638e84c4b..1886294e12f7 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1,7 +1,7 @@ #ifndef _FS_CEPH_SUPER_H #define _FS_CEPH_SUPER_H -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <asm/unaligned.h> #include <linux/backing-dev.h> @@ -14,13 +14,7 @@ #include <linux/writeback.h> #include <linux/slab.h> -#include "types.h" -#include "messenger.h" -#include "msgpool.h" -#include "mon_client.h" -#include "mds_client.h" -#include "osd_client.h" -#include "ceph_fs.h" +#include <linux/ceph/libceph.h> /* f_type in struct statfs */ #define CEPH_SUPER_MAGIC 0x00c36400 @@ -30,42 +24,25 @@ #define CEPH_BLOCK_SHIFT 20 /* 1 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) -/* - * Supported features - */ -#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK -#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR +#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ +#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ +#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ -/* - * mount options - */ -#define CEPH_OPT_FSID (1<<0) -#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ -#define CEPH_OPT_MYIP (1<<2) /* specified my ip */ -#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */ -#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ -#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */ -#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ +#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) -#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES) +#define ceph_set_mount_opt(fsc, opt) \ + (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; +#define ceph_test_mount_opt(fsc, opt) \ + (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) -#define ceph_set_opt(client, opt) \ - (client)->mount_args->flags |= CEPH_OPT_##opt; -#define ceph_test_opt(client, opt) \ - (!!((client)->mount_args->flags & CEPH_OPT_##opt)) +#define CEPH_MAX_READDIR_DEFAULT 1024 +#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) +#define CEPH_SNAPDIRNAME_DEFAULT ".snap" - -struct ceph_mount_args { - int sb_flags; +struct ceph_mount_options { int flags; - struct ceph_fsid fsid; - struct ceph_entity_addr my_addr; - int num_mon; - struct ceph_entity_addr *mon_addr; - int mount_timeout; - int osd_idle_ttl; - int osd_timeout; - int osd_keepalive_timeout; + int sb_flags; + int wsize; int rsize; /* max readahead */ int congestion_kb; /* max writeback in flight */ @@ -73,82 +50,25 @@ struct ceph_mount_args { int cap_release_safety; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ - char *snapdir_name; /* default ".snap" */ - char *name; - char *secret; -}; -/* - * defaults - */ -#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 -#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ -#define CEPH_OSD_KEEPALIVE_DEFAULT 5 -#define CEPH_OSD_IDLE_TTL_DEFAULT 60 -#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ -#define CEPH_MAX_READDIR_DEFAULT 1024 -#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) - -#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) -#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) - -#define CEPH_SNAPDIRNAME_DEFAULT ".snap" -#define CEPH_AUTH_NAME_DEFAULT "guest" -/* - * Delay telling the MDS we no longer want caps, in case we reopen - * the file. Delay a minimum amount of time, even if we send a cap - * message for some other reason. Otherwise, take the oppotunity to - * update the mds to avoid sending another message later. - */ -#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ -#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ - -#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) - -/* mount state */ -enum { - CEPH_MOUNT_MOUNTING, - CEPH_MOUNT_MOUNTED, - CEPH_MOUNT_UNMOUNTING, - CEPH_MOUNT_UNMOUNTED, - CEPH_MOUNT_SHUTDOWN, -}; - -/* - * subtract jiffies - */ -static inline unsigned long time_sub(unsigned long a, unsigned long b) -{ - BUG_ON(time_after(b, a)); - return (long)a - (long)b; -} - -/* - * per-filesystem client state - * - * possibly shared by multiple mount points, if they are - * mounting the same ceph filesystem/cluster. - */ -struct ceph_client { - struct ceph_fsid fsid; - bool have_fsid; + /* + * everything above this point can be memcmp'd; everything below + * is handled in compare_mount_options() + */ - struct mutex mount_mutex; /* serialize mount attempts */ - struct ceph_mount_args *mount_args; + char *snapdir_name; /* default ".snap" */ +}; +struct ceph_fs_client { struct super_block *sb; - unsigned long mount_state; - wait_queue_head_t auth_wq; - - int auth_err; + struct ceph_mount_options *mount_options; + struct ceph_client *client; + unsigned long mount_state; int min_caps; /* min caps i added */ - struct ceph_messenger *msgr; /* messenger instance */ - struct ceph_mon_client monc; - struct ceph_mds_client mdsc; - struct ceph_osd_client osdc; + struct ceph_mds_client *mdsc; /* writeback */ mempool_t *wb_pagevec_pool; @@ -160,14 +80,14 @@ struct ceph_client { struct backing_dev_info backing_dev_info; #ifdef CONFIG_DEBUG_FS - struct dentry *debugfs_monmap; - struct dentry *debugfs_mdsmap, *debugfs_osdmap; - struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps; + struct dentry *debugfs_dentry_lru, *debugfs_caps; struct dentry *debugfs_congestion_kb; struct dentry *debugfs_bdi; + struct dentry *debugfs_mdsc, *debugfs_mdsmap; #endif }; + /* * File i/o capability. This tracks shared state with the metadata * server that allows us to cache or writeback attributes or to read @@ -275,6 +195,20 @@ struct ceph_inode_xattr { int should_free_val; }; +/* + * Ceph dentry state + */ +struct ceph_dentry_info { + struct ceph_mds_session *lease_session; + u32 lease_gen, lease_shared_gen; + u32 lease_seq; + unsigned long lease_renew_after, lease_renew_from; + struct list_head lru; + struct dentry *dentry; + u64 time; + u64 offset; +}; + struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing @@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info { /* * Ceph inode. */ -#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ -#define CEPH_I_NODELAY 4 /* do not delay cap release */ -#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ - struct ceph_inode_info { struct ceph_vino i_vino; /* ceph ino + snap */ @@ -391,6 +320,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode) return container_of(inode, struct ceph_inode_info, vfs_inode); } +static inline struct ceph_vino ceph_vino(struct inode *inode) +{ + return ceph_inode(inode)->i_vino; +} + +/* + * ino_t is <64 bits on many architectures, blech. + * + * don't include snap in ino hash, at least for now. + */ +static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) +{ + ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ +#if BITS_PER_LONG == 32 + ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; + if (!ino) + ino = 1; +#endif + return ino; +} + +/* for printf-style formatting */ +#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap + +static inline u64 ceph_ino(struct inode *inode) +{ + return ceph_inode(inode)->i_vino.ino; +} +static inline u64 ceph_snap(struct inode *inode) +{ + return ceph_inode(inode)->i_vino.snap; +} + +static inline int ceph_ino_compare(struct inode *inode, void *data) +{ + struct ceph_vino *pvino = (struct ceph_vino *)data; + struct ceph_inode_info *ci = ceph_inode(inode); + return ci->i_vino.ino == pvino->ino && + ci->i_vino.snap == pvino->snap; +} + +static inline struct inode *ceph_find_inode(struct super_block *sb, + struct ceph_vino vino) +{ + ino_t t = ceph_vino_to_ino(vino); + return ilookup5(sb, t, ceph_ino_compare, &vino); +} + + +/* + * Ceph inode. + */ +#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ +#define CEPH_I_NODELAY 4 /* do not delay cap release */ +#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ +#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ + static inline void ceph_i_clear(struct inode *inode, unsigned mask) { struct ceph_inode_info *ci = ceph_inode(inode); @@ -414,8 +400,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask) struct ceph_inode_info *ci = ceph_inode(inode); bool r; - smp_mb(); + spin_lock(&inode->i_lock); r = (ci->i_ceph_flags & mask) == mask; + spin_unlock(&inode->i_lock); return r; } @@ -432,20 +419,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, struct ceph_inode_frag *pfrag, int *found); -/* - * Ceph dentry state - */ -struct ceph_dentry_info { - struct ceph_mds_session *lease_session; - u32 lease_gen, lease_shared_gen; - u32 lease_seq; - unsigned long lease_renew_after, lease_renew_from; - struct list_head lru; - struct dentry *dentry; - u64 time; - u64 offset; -}; - static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) { return (struct ceph_dentry_info *)dentry->d_fsdata; @@ -456,22 +429,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) return ((loff_t)frag << 32) | (loff_t)off; } -/* - * ino_t is <64 bits on many architectures, blech. - * - * don't include snap in ino hash, at least for now. - */ -static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) -{ - ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ -#if BITS_PER_LONG == 32 - ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; - if (!ino) - ino = 1; -#endif - return ino; -} - static inline int ceph_set_ino_cb(struct inode *inode, void *data) { ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; @@ -479,39 +436,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data) return 0; } -static inline struct ceph_vino ceph_vino(struct inode *inode) -{ - return ceph_inode(inode)->i_vino; -} - -/* for printf-style formatting */ -#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap - -static inline u64 ceph_ino(struct inode *inode) -{ - return ceph_inode(inode)->i_vino.ino; -} -static inline u64 ceph_snap(struct inode *inode) -{ - return ceph_inode(inode)->i_vino.snap; -} - -static inline int ceph_ino_compare(struct inode *inode, void *data) -{ - struct ceph_vino *pvino = (struct ceph_vino *)data; - struct ceph_inode_info *ci = ceph_inode(inode); - return ci->i_vino.ino == pvino->ino && - ci->i_vino.snap == pvino->snap; -} - -static inline struct inode *ceph_find_inode(struct super_block *sb, - struct ceph_vino vino) -{ - ino_t t = ceph_vino_to_ino(vino); - return ilookup5(sb, t, ceph_ino_compare, &vino); -} - - /* * caps helpers */ @@ -576,18 +500,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); -extern void ceph_reservation_status(struct ceph_client *client, +extern void ceph_reservation_status(struct ceph_fs_client *client, int *total, int *avail, int *used, int *reserved, int *min); -static inline struct ceph_client *ceph_inode_to_client(struct inode *inode) +static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) { - return (struct ceph_client *)inode->i_sb->s_fs_info; + return (struct ceph_fs_client *)inode->i_sb->s_fs_info; } -static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb) +static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) { - return (struct ceph_client *)sb->s_fs_info; + return (struct ceph_fs_client *)sb->s_fs_info; } @@ -617,51 +541,6 @@ struct ceph_file_info { /* - * snapshots - */ - -/* - * A "snap context" is the set of existing snapshots when we - * write data. It is used by the OSD to guide its COW behavior. - * - * The ceph_snap_context is refcounted, and attached to each dirty - * page, indicating which context the dirty data belonged when it was - * dirtied. - */ -struct ceph_snap_context { - atomic_t nref; - u64 seq; - int num_snaps; - u64 snaps[]; -}; - -static inline struct ceph_snap_context * -ceph_get_snap_context(struct ceph_snap_context *sc) -{ - /* - printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), - atomic_read(&sc->nref)+1); - */ - if (sc) - atomic_inc(&sc->nref); - return sc; -} - -static inline void ceph_put_snap_context(struct ceph_snap_context *sc) -{ - if (!sc) - return; - /* - printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), - atomic_read(&sc->nref)-1); - */ - if (atomic_dec_and_test(&sc->nref)) { - /*printk(" deleting snap_context %p\n", sc);*/ - kfree(sc); - } -} - -/* * A "snap realm" describes a subset of the file hierarchy sharing * the same set of snapshots that apply to it. The realms themselves * are organized into a hierarchy, such that children inherit (some of) @@ -699,16 +578,33 @@ struct ceph_snap_realm { spinlock_t inodes_with_caps_lock; }; - - -/* - * calculate the number of pages a given length and offset map onto, - * if we align the data. - */ -static inline int calc_pages_for(u64 off, u64 len) +static inline int default_congestion_kb(void) { - return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - - (off >> PAGE_CACHE_SHIFT); + int congestion_kb; + + /* + * Copied from NFS + * + * congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (congestion_kb > 256*1024) + congestion_kb = 256*1024; + + return congestion_kb; } @@ -741,16 +637,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) ci_item)->writing; } - -/* super.c */ -extern struct kmem_cache *ceph_inode_cachep; -extern struct kmem_cache *ceph_cap_cachep; -extern struct kmem_cache *ceph_dentry_cachep; -extern struct kmem_cache *ceph_file_cachep; - -extern const char *ceph_msg_type_name(int type); -extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); - /* inode.c */ extern const struct inode_operations ceph_file_iops; @@ -857,12 +743,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); /* file.c */ extern const struct file_operations ceph_file_fops; extern const struct address_space_operations ceph_aops; +extern int ceph_copy_to_page_vector(struct page **pages, + const char *data, + loff_t off, size_t len); +extern int ceph_copy_from_page_vector(struct page **pages, + char *data, + loff_t off, size_t len); +extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); extern int ceph_open(struct inode *inode, struct file *file); extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd, int mode, int locked_dir); extern int ceph_release(struct inode *inode, struct file *filp); -extern void ceph_release_page_vector(struct page **pages, int num_pages); /* dir.c */ extern const struct file_operations ceph_dir_fops; @@ -892,12 +784,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* export.c */ extern const struct export_operations ceph_export_ops; -/* debugfs.c */ -extern int ceph_debugfs_init(void); -extern void ceph_debugfs_cleanup(void); -extern int ceph_debugfs_client_init(struct ceph_client *client); -extern void ceph_debugfs_client_cleanup(struct ceph_client *client); - /* locks.c */ extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); @@ -914,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) return NULL; } +/* debugfs.c */ +extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); +extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); + #endif /* _FS_CEPH_SUPER_H */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 9578af610b73..6e12a6ba5f79 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1,6 +1,9 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> + #include "super.h" -#include "decode.h" +#include "mds_client.h" + +#include <linux/ceph/decode.h> #include <linux/xattr.h> #include <linux/slab.h> @@ -620,12 +623,12 @@ out: static int ceph_sync_setxattr(struct dentry *dentry, const char *name, const char *value, size_t size, int flags) { - struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct inode *parent_inode = dentry->d_parent->d_inode; struct ceph_mds_request *req; - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; int err; int i, nr_pages; struct page **pages = NULL; @@ -713,10 +716,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name, /* preallocate memory for xattr name, value, index node */ err = -ENOMEM; - newname = kmalloc(name_len + 1, GFP_NOFS); + newname = kmemdup(name, name_len + 1, GFP_NOFS); if (!newname) goto out; - memcpy(newname, name, name_len + 1); if (val_len) { newval = kmalloc(val_len + 1, GFP_NOFS); @@ -777,8 +779,8 @@ out: static int ceph_send_removexattr(struct dentry *dentry, const char *name) { - struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = dentry->d_inode; struct inode *parent_inode = dentry->d_parent->d_inode; struct ceph_mds_request *req; diff --git a/fs/exec.c b/fs/exec.c index 828dd2461d6b..6d2b6f936858 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2014,3 +2014,43 @@ fail_creds: fail: return; } + +/* + * Core dumping helper functions. These are the only things you should + * do on a core-file: use only these functions to write out all the + * necessary info. + */ +int dump_write(struct file *file, const void *addr, int nr) +{ + return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; +} +EXPORT_SYMBOL(dump_write); + +int dump_seek(struct file *file, loff_t off) +{ + int ret = 1; + + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + if (file->f_op->llseek(file, off, SEEK_CUR) < 0) + return 0; + } else { + char *buf = (char *)get_zeroed_page(GFP_KERNEL); + + if (!buf) + return 0; + while (off > 0) { + unsigned long n = off; + + if (n > PAGE_SIZE) + n = PAGE_SIZE; + if (!dump_write(file, buf, n)) { + ret = 0; + break; + } + off -= n; + } + free_page((unsigned long)buf); + } + return ret; +} +EXPORT_SYMBOL(dump_seek); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index eb7368ebd8cd..3eadd97324b1 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -54,6 +54,9 @@ struct page_collect { unsigned nr_pages; unsigned long length; loff_t pg_first; /* keep 64bit also in 32-arches */ + bool read_4_write; /* This means two things: that the read is sync + * And the pages should not be unlocked. + */ }; static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, @@ -71,6 +74,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, pcol->nr_pages = 0; pcol->length = 0; pcol->pg_first = -1; + pcol->read_4_write = false; } static void _pcol_reset(struct page_collect *pcol) @@ -347,7 +351,8 @@ static int readpage_strip(void *data, struct page *page) if (PageError(page)) ClearPageError(page); - unlock_page(page); + if (!pcol->read_4_write) + unlock_page(page); EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," " splitting\n", inode->i_ino, page->index); @@ -428,6 +433,7 @@ static int _readpage(struct page *page, bool is_sync) /* readpage_strip might call read_exec(,is_sync==false) at several * places but not if we have a single page. */ + pcol.read_4_write = is_sync; ret = readpage_strip(&pcol, page); if (ret) { EXOFS_ERR("_readpage => %d\n", ret); diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index cc9665522148..c465ae066c62 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,6 @@ config GFS2_FS tristate "GFS2 file system support" - depends on EXPERIMENTAL && (64BIT || LBDAF) + depends on (64BIT || LBDAF) select DLM if GFS2_FS_LOCKING_DLM select CONFIGFS_FS if GFS2_FS_LOCKING_DLM select SYSFS if GFS2_FS_LOCKING_DLM diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 194fe16d8418..6b24afb96aae 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -36,8 +36,8 @@ #include "glops.h" -static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, - unsigned int from, unsigned int to) +void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, + unsigned int from, unsigned int to) { struct buffer_head *head = page_buffers(page); unsigned int bsize = head->b_size; @@ -615,7 +615,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, unsigned int data_blocks = 0, ind_blocks = 0, rblocks; int alloc_required; int error = 0; - struct gfs2_alloc *al; + struct gfs2_alloc *al = NULL; pgoff_t index = pos >> PAGE_CACHE_SHIFT; unsigned from = pos & (PAGE_CACHE_SIZE - 1); unsigned to = from + len; @@ -663,6 +663,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, rblocks += RES_STATFS + RES_QUOTA; if (&ip->i_inode == sdp->sd_rindex) rblocks += 2 * RES_STATFS; + if (alloc_required) + rblocks += gfs2_rg_blocks(al); error = gfs2_trans_begin(sdp, rblocks, PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); @@ -696,13 +698,11 @@ out: page_cache_release(page); - /* - * XXX(truncate): the call below should probably be replaced with - * a call to the gfs2-specific truncate blocks helper to actually - * release disk blocks.. - */ + gfs2_trans_end(sdp); if (pos + len > ip->i_inode.i_size) - truncate_setsize(&ip->i_inode, ip->i_inode.i_size); + gfs2_trim_blocks(&ip->i_inode); + goto out_trans_fail; + out_endtrans: gfs2_trans_end(sdp); out_trans_fail: @@ -802,10 +802,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, page_cache_release(page); if (copied) { - if (inode->i_size < to) { + if (inode->i_size < to) i_size_write(inode, to); - ip->i_disksize = inode->i_size; - } gfs2_dinode_out(ip, di); mark_inode_dirty(inode); } @@ -876,8 +874,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); if (ret > 0) { - if (inode->i_size > ip->i_disksize) - ip->i_disksize = inode->i_size; gfs2_dinode_out(ip, dibh->b_data); mark_inode_dirty(inode); } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 6f482809d1a3..5476c066d4ee 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -50,7 +50,7 @@ struct strip_mine { * @ip: the inode * @dibh: the dinode buffer * @block: the block number that was allocated - * @private: any locked page held by the caller process + * @page: The (optional) page. This is looked up if @page is NULL * * Returns: errno */ @@ -109,8 +109,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, /** * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big * @ip: The GFS2 inode to unstuff - * @unstuffer: the routine that handles unstuffing a non-zero length file - * @private: private data for the unstuffer + * @page: The (optional) page. This is looked up if the @page is NULL * * This routine unstuffs a dinode and returns it to a "normal" state such * that the height can be grown in the traditional way. @@ -132,7 +131,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) if (error) goto out; - if (ip->i_disksize) { + if (i_size_read(&ip->i_inode)) { /* Get a free block, fill it with the stuffed data, and write it out to disk */ @@ -161,7 +160,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) di = (struct gfs2_dinode *)dibh->b_data; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); - if (ip->i_disksize) { + if (i_size_read(&ip->i_inode)) { *(__be64 *)(di + 1) = cpu_to_be64(block); gfs2_add_inode_blocks(&ip->i_inode, 1); di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); @@ -885,83 +884,14 @@ out: } /** - * do_grow - Make a file look bigger than it is - * @ip: the inode - * @size: the size to set the file to - * - * Called with an exclusive lock on @ip. - * - * Returns: errno - */ - -static int do_grow(struct gfs2_inode *ip, u64 size) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al; - struct buffer_head *dibh; - int error; - - al = gfs2_alloc_get(ip); - if (!al) - return -ENOMEM; - - error = gfs2_quota_lock_check(ip); - if (error) - goto out; - - al->al_requested = sdp->sd_max_height + RES_DATA; - - error = gfs2_inplace_reserve(ip); - if (error) - goto out_gunlock_q; - - error = gfs2_trans_begin(sdp, - sdp->sd_max_height + al->al_rgd->rd_length + - RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0); - if (error) - goto out_ipres; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out_end_trans; - - if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { - if (gfs2_is_stuffed(ip)) { - error = gfs2_unstuff_dinode(ip, NULL); - if (error) - goto out_brelse; - } - } - - ip->i_disksize = size; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - -out_brelse: - brelse(dibh); -out_end_trans: - gfs2_trans_end(sdp); -out_ipres: - gfs2_inplace_release(ip); -out_gunlock_q: - gfs2_quota_unlock(ip); -out: - gfs2_alloc_put(ip); - return error; -} - - -/** * gfs2_block_truncate_page - Deal with zeroing out data for truncate * * This is partly borrowed from ext3. */ -static int gfs2_block_truncate_page(struct address_space *mapping) +static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) { struct inode *inode = mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - loff_t from = inode->i_size; unsigned long index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); unsigned blocksize, iblock, length, pos; @@ -1023,9 +953,11 @@ unlock: return err; } -static int trunc_start(struct gfs2_inode *ip, u64 size) +static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *mapping = inode->i_mapping; struct buffer_head *dibh; int journaled = gfs2_is_jdata(ip); int error; @@ -1039,31 +971,26 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) if (error) goto out; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + if (gfs2_is_stuffed(ip)) { - u64 dsize = size + sizeof(struct gfs2_dinode); - ip->i_disksize = size; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - if (dsize > dibh->b_size) - dsize = dibh->b_size; - gfs2_buffer_clear_tail(dibh, dsize); - error = 1; + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); } else { - if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) - error = gfs2_block_truncate_page(ip->i_inode.i_mapping); - - if (!error) { - ip->i_disksize = size; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); + if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) { + error = gfs2_block_truncate_page(mapping, newsize); + if (error) + goto out_brelse; } + ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; } - brelse(dibh); + i_size_write(inode, newsize); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_dinode_out(ip, dibh->b_data); + truncate_pagecache(inode, oldsize, newsize); +out_brelse: + brelse(dibh); out: gfs2_trans_end(sdp); return error; @@ -1123,7 +1050,7 @@ static int trunc_end(struct gfs2_inode *ip) if (error) goto out; - if (!ip->i_disksize) { + if (!i_size_read(&ip->i_inode)) { ip->i_height = 0; ip->i_goal = ip->i_no_addr; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); @@ -1143,92 +1070,154 @@ out: /** * do_shrink - make a file smaller - * @ip: the inode - * @size: the size to make the file - * @truncator: function to truncate the last partial block + * @inode: the inode + * @oldsize: the current inode size + * @newsize: the size to make the file * - * Called with an exclusive lock on @ip. + * Called with an exclusive lock on @inode. The @size must + * be equal to or smaller than the current inode size. * * Returns: errno */ -static int do_shrink(struct gfs2_inode *ip, u64 size) +static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize) { + struct gfs2_inode *ip = GFS2_I(inode); int error; - error = trunc_start(ip, size); + error = trunc_start(inode, oldsize, newsize); if (error < 0) return error; - if (error > 0) + if (gfs2_is_stuffed(ip)) return 0; - error = trunc_dealloc(ip, size); - if (!error) + error = trunc_dealloc(ip, newsize); + if (error == 0) error = trunc_end(ip); return error; } -static int do_touch(struct gfs2_inode *ip, u64 size) +void gfs2_trim_blocks(struct inode *inode) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u64 size = inode->i_size; + int ret; + + ret = do_shrink(inode, size, size); + WARN_ON(ret != 0); +} + +/** + * do_grow - Touch and update inode size + * @inode: The inode + * @size: The new size + * + * This function updates the timestamps on the inode and + * may also increase the size of the inode. This function + * must not be called with @size any smaller than the current + * inode size. + * + * Although it is not strictly required to unstuff files here, + * earlier versions of GFS2 have a bug in the stuffed file reading + * code which will result in a buffer overrun if the size is larger + * than the max stuffed file size. In order to prevent this from + * occuring, such files are unstuffed, but in other cases we can + * just update the inode size directly. + * + * Returns: 0 on success, or -ve on error + */ + +static int do_grow(struct inode *inode, u64 size) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *dibh; + struct gfs2_alloc *al = NULL; int error; - error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (gfs2_is_stuffed(ip) && + (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) { + al = gfs2_alloc_get(ip); + if (al == NULL) + return -ENOMEM; + + error = gfs2_quota_lock_check(ip); + if (error) + goto do_grow_alloc_put; + + al->al_requested = 1; + error = gfs2_inplace_reserve(ip); + if (error) + goto do_grow_qunlock; + } + + error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0); if (error) - return error; + goto do_grow_release; - down_write(&ip->i_rw_mutex); + if (al) { + error = gfs2_unstuff_dinode(ip, NULL); + if (error) + goto do_end_trans; + } error = gfs2_meta_inode_buffer(ip, &dibh); if (error) - goto do_touch_out; + goto do_end_trans; + i_size_write(inode, size); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); -do_touch_out: - up_write(&ip->i_rw_mutex); +do_end_trans: gfs2_trans_end(sdp); +do_grow_release: + if (al) { + gfs2_inplace_release(ip); +do_grow_qunlock: + gfs2_quota_unlock(ip); +do_grow_alloc_put: + gfs2_alloc_put(ip); + } return error; } /** - * gfs2_truncatei - make a file a given size - * @ip: the inode - * @size: the size to make the file - * @truncator: function to truncate the last partial block + * gfs2_setattr_size - make a file a given size + * @inode: the inode + * @newsize: the size to make the file * - * The file size can grow, shrink, or stay the same size. + * The file size can grow, shrink, or stay the same size. This + * is called holding i_mutex and an exclusive glock on the inode + * in question. * * Returns: errno */ -int gfs2_truncatei(struct gfs2_inode *ip, u64 size) +int gfs2_setattr_size(struct inode *inode, u64 newsize) { - int error; + int ret; + u64 oldsize; - if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) - return -EINVAL; + BUG_ON(!S_ISREG(inode->i_mode)); - if (size > ip->i_disksize) - error = do_grow(ip, size); - else if (size < ip->i_disksize) - error = do_shrink(ip, size); - else - /* update time stamps */ - error = do_touch(ip, size); + ret = inode_newsize_ok(inode, newsize); + if (ret) + return ret; - return error; + oldsize = inode->i_size; + if (newsize >= oldsize) + return do_grow(inode, newsize); + + return do_shrink(inode, oldsize, newsize); } int gfs2_truncatei_resume(struct gfs2_inode *ip) { int error; - error = trunc_dealloc(ip, ip->i_disksize); + error = trunc_dealloc(ip, i_size_read(&ip->i_inode)); if (!error) error = trunc_end(ip); return error; @@ -1269,7 +1258,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, shift = sdp->sd_sb.sb_bsize_shift; BUG_ON(gfs2_is_dir(ip)); - end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; + end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift; lblock = offset >> shift; lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; if (lblock_stop > end_of_file) diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index a20a5213135a..42fea03e2bd9 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -44,14 +44,16 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, } } -int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); -int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); -int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); - -int gfs2_truncatei(struct gfs2_inode *ip, u64 size); -int gfs2_truncatei_resume(struct gfs2_inode *ip); -int gfs2_file_dealloc(struct gfs2_inode *ip); -int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, - unsigned int len); +extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); +extern int gfs2_block_map(struct inode *inode, sector_t lblock, + struct buffer_head *bh, int create); +extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, + u64 *dblock, unsigned *extlen); +extern int gfs2_setattr_size(struct inode *inode, u64 size); +extern void gfs2_trim_blocks(struct inode *inode); +extern int gfs2_truncatei_resume(struct gfs2_inode *ip); +extern int gfs2_file_dealloc(struct gfs2_inode *ip); +extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, + unsigned int len); #endif /* __BMAP_DOT_H__ */ diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index bb7907bde3d8..6798755b3858 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -49,7 +49,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) ip = GFS2_I(inode); } - if (sdp->sd_args.ar_localcaching) + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) goto valid; had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index b9dd88a78dd4..5c356d09c321 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -79,6 +79,9 @@ #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) +struct qstr gfs2_qdot __read_mostly; +struct qstr gfs2_qdotdot __read_mostly; + typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len, u64 leaf_no, void *data); typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, @@ -127,8 +130,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, gfs2_trans_add_bh(ip->i_gl, dibh, 1); memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); - if (ip->i_disksize < offset + size) - ip->i_disksize = offset + size; + if (ip->i_inode.i_size < offset + size) + i_size_write(&ip->i_inode, offset + size); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(ip, dibh->b_data); @@ -225,8 +228,8 @@ out: if (error) return error; - if (ip->i_disksize < offset + copied) - ip->i_disksize = offset + copied; + if (ip->i_inode.i_size < offset + copied) + i_size_write(&ip->i_inode, offset + copied); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); @@ -275,12 +278,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, unsigned int o; int copied = 0; int error = 0; + u64 disksize = i_size_read(&ip->i_inode); - if (offset >= ip->i_disksize) + if (offset >= disksize) return 0; - if (offset + size > ip->i_disksize) - size = ip->i_disksize - offset; + if (offset + size > disksize) + size = disksize - offset; if (!size) return 0; @@ -727,7 +731,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, unsigned hsize = 1 << ip->i_depth; unsigned index; u64 ln; - if (hsize * sizeof(u64) != ip->i_disksize) { + if (hsize * sizeof(u64) != i_size_read(inode)) { gfs2_consist_inode(ip); return ERR_PTR(-EIO); } @@ -879,7 +883,7 @@ static int dir_make_exhash(struct inode *inode) for (x = sdp->sd_hash_ptrs; x--; lp++) *lp = cpu_to_be64(bn); - dip->i_disksize = sdp->sd_sb.sb_bsize / 2; + i_size_write(inode, sdp->sd_sb.sb_bsize / 2); gfs2_add_inode_blocks(&dip->i_inode, 1); dip->i_diskflags |= GFS2_DIF_EXHASH; @@ -1057,11 +1061,12 @@ static int dir_double_exhash(struct gfs2_inode *dip) u64 *buf; u64 *from, *to; u64 block; + u64 disksize = i_size_read(&dip->i_inode); int x; int error = 0; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != dip->i_disksize) { + if (hsize * sizeof(u64) != disksize) { gfs2_consist_inode(dip); return -EIO; } @@ -1072,7 +1077,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) if (!buf) return -ENOMEM; - for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { + for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) { error = gfs2_dir_read_data(dip, (char *)buf, block * sdp->sd_hash_bsize, sdp->sd_hash_bsize, 1); @@ -1370,7 +1375,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, unsigned depth = 0; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != dip->i_disksize) { + if (hsize * sizeof(u64) != i_size_read(inode)) { gfs2_consist_inode(dip); return -EIO; } @@ -1784,7 +1789,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data) int error = 0; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != dip->i_disksize) { + if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) { gfs2_consist_inode(dip); return -EIO; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 4f919440c3be..a98f644bd3df 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -17,23 +17,24 @@ struct inode; struct gfs2_inode; struct gfs2_inum; -struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename); -int gfs2_dir_check(struct inode *dir, const struct qstr *filename, - const struct gfs2_inode *ip); -int gfs2_dir_add(struct inode *inode, const struct qstr *filename, - const struct gfs2_inode *ip, unsigned int type); -int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); -int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir); -int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, - const struct gfs2_inode *nip, unsigned int new_type); +extern struct inode *gfs2_dir_search(struct inode *dir, + const struct qstr *filename); +extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, + const struct gfs2_inode *ip); +extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, + const struct gfs2_inode *ip, unsigned int type); +extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); +extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, + filldir_t filldir); +extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + const struct gfs2_inode *nip, unsigned int new_type); -int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); +extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); -int gfs2_diradd_alloc_required(struct inode *dir, - const struct qstr *filename); -int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, - struct buffer_head **bhp); +extern int gfs2_diradd_alloc_required(struct inode *dir, + const struct qstr *filename); +extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp); static inline u32 gfs2_disk_hash(const char *data, int len) { @@ -61,4 +62,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct memcpy(dent + 1, name->name, name->len); } +extern struct qstr gfs2_qdot; +extern struct qstr gfs2_qdotdot; + #endif /* __DIR_DOT_H__ */ diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index dfe237a3f8ad..06d582732d34 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -126,16 +126,9 @@ static int gfs2_get_name(struct dentry *parent, char *name, static struct dentry *gfs2_get_parent(struct dentry *child) { - struct qstr dotdot; struct dentry *dentry; - /* - * XXX(hch): it would be a good idea to keep this around as a - * static variable. - */ - gfs2_str2qstr(&dotdot, ".."); - - dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1)); + dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1)); if (!IS_ERR(dentry)) dentry->d_op = &gfs2_dops; return dentry; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4edd662c8232..237ee6a940df 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -382,8 +382,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) rblocks = RES_DINODE + ind_blocks; if (gfs2_is_jdata(ip)) rblocks += data_blocks ? data_blocks : 1; - if (ind_blocks || data_blocks) + if (ind_blocks || data_blocks) { rblocks += RES_STATFS + RES_QUOTA; + rblocks += gfs2_rg_blocks(al); + } ret = gfs2_trans_begin(sdp, rblocks, 0); if (ret) goto out_trans_fail; @@ -491,7 +493,7 @@ static int gfs2_open(struct inode *inode, struct file *file) goto fail; if (!(file->f_flags & O_LARGEFILE) && - ip->i_disksize > MAX_NON_LFS) { + i_size_read(inode) > MAX_NON_LFS) { error = -EOVERFLOW; goto fail_gunlock; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9adf8f924e08..87778857f099 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -441,6 +441,8 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) else gfs2_glock_put_nolock(gl); } + if (held1 && held2 && list_empty(&gl->gl_holders)) + clear_bit(GLF_QUEUED, &gl->gl_flags); gl->gl_state = new_state; gl->gl_tchange = jiffies; @@ -1012,6 +1014,7 @@ fail: if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) insert_pt = &gh2->gh_list; } + set_bit(GLF_QUEUED, &gl->gl_flags); if (likely(insert_pt == NULL)) { list_add_tail(&gh->gh_list, &gl->gl_holders); if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) @@ -1310,10 +1313,12 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) gfs2_glock_hold(gl); holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; - if (time_before(now, holdtime)) - delay = holdtime - now; - if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) - delay = gl->gl_ops->go_min_hold_time; + if (test_bit(GLF_QUEUED, &gl->gl_flags)) { + if (time_before(now, holdtime)) + delay = holdtime - now; + if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) + delay = gl->gl_ops->go_min_hold_time; + } spin_lock(&gl->gl_spin); handle_callback(gl, state, delay); @@ -1512,7 +1517,7 @@ static void clear_glock(struct gfs2_glock *gl) spin_unlock(&lru_lock); spin_lock(&gl->gl_spin); - if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) + if (gl->gl_state != LM_ST_UNLOCKED) handle_callback(gl, LM_ST_UNLOCKED, 0); spin_unlock(&gl->gl_spin); gfs2_glock_hold(gl); @@ -1660,6 +1665,8 @@ static const char *gflags2str(char *buf, const unsigned long *gflags) *p++ = 'I'; if (test_bit(GLF_FROZEN, gflags)) *p++ = 'F'; + if (test_bit(GLF_QUEUED, gflags)) + *p++ = 'q'; *p = 0; return buf; } @@ -1776,10 +1783,12 @@ int __init gfs2_glock_init(void) } #endif - glock_workqueue = create_workqueue("glock_workqueue"); + glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER | + WQ_HIGHPRI | WQ_FREEZEABLE, 0); if (IS_ERR(glock_workqueue)) return PTR_ERR(glock_workqueue); - gfs2_delete_workqueue = create_workqueue("delete_workqueue"); + gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER | + WQ_FREEZEABLE, 0); if (IS_ERR(gfs2_delete_workqueue)) { destroy_workqueue(glock_workqueue); return PTR_ERR(gfs2_delete_workqueue); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 2bda1911b156..db1c26d6d220 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -215,7 +215,7 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** - * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock + * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock * @gl: the glock * @state: the state we're requesting * @flags: the modifier flags diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 49f97d3bb690..0d149dcc04e5 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -262,13 +262,12 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) const struct gfs2_inode *ip = gl->gl_object; if (ip == NULL) return 0; - gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n", + gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", (unsigned long long)ip->i_no_formal_ino, (unsigned long long)ip->i_no_addr, IF2DT(ip->i_inode.i_mode), ip->i_flags, (unsigned int)ip->i_diskflags, - (unsigned long long)ip->i_inode.i_size, - (unsigned long long)ip->i_disksize); + (unsigned long long)i_size_read(&ip->i_inode)); return 0; } @@ -453,7 +452,6 @@ const struct gfs2_glock_operations *gfs2_glops_list[] = { [LM_TYPE_META] = &gfs2_meta_glops, [LM_TYPE_INODE] = &gfs2_inode_glops, [LM_TYPE_RGRP] = &gfs2_rgrp_glops, - [LM_TYPE_NONDISK] = &gfs2_trans_glops, [LM_TYPE_IOPEN] = &gfs2_iopen_glops, [LM_TYPE_FLOCK] = &gfs2_flock_glops, [LM_TYPE_NONDISK] = &gfs2_nondisk_glops, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index fdbf4b366fa5..764fbb49efc8 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -196,6 +196,7 @@ enum { GLF_REPLY_PENDING = 9, GLF_INITIAL = 10, GLF_FROZEN = 11, + GLF_QUEUED = 12, }; struct gfs2_glock { @@ -267,7 +268,6 @@ struct gfs2_inode { u64 i_no_formal_ino; u64 i_generation; u64 i_eattr; - loff_t i_disksize; unsigned long i_flags; /* GIF_... */ struct gfs2_glock *i_gl; /* Move into i_gh? */ struct gfs2_holder i_iopen_gh; @@ -416,11 +416,8 @@ struct gfs2_args { char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ unsigned int ar_spectator:1; /* Don't get a journal */ - unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */ unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ - unsigned int ar_localcaching:1; /* Local caching */ unsigned int ar_debug:1; /* Oops on errors */ - unsigned int ar_upgrade:1; /* Upgrade ondisk format */ unsigned int ar_posix_acl:1; /* Enable posix acls */ unsigned int ar_quota:2; /* off/account/on */ unsigned int ar_suiddir:1; /* suiddir support */ @@ -497,7 +494,7 @@ struct gfs2_sb_host { */ struct lm_lockstruct { - unsigned int ls_jid; + int ls_jid; unsigned int ls_first; unsigned int ls_first_done; unsigned int ls_nodir; @@ -572,6 +569,7 @@ struct gfs2_sbd { struct list_head sd_rindex_mru_list; struct gfs2_rgrpd *sd_rindex_forward; unsigned int sd_rgrps; + unsigned int sd_max_rg_data; /* Journal index stuff */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 08140f185a37..06370f8bd8cf 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -359,8 +359,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) * to do that. */ ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); - ip->i_disksize = be64_to_cpu(str->di_size); - i_size_write(&ip->i_inode, ip->i_disksize); + i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); atime.tv_sec = be64_to_cpu(str->di_atime); atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); @@ -1055,7 +1054,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_uid = cpu_to_be32(ip->i_inode.i_uid); str->di_gid = cpu_to_be32(ip->i_inode.i_gid); str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); - str->di_size = cpu_to_be64(ip->i_disksize); + str->di_size = cpu_to_be64(i_size_read(&ip->i_inode)); str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); @@ -1085,8 +1084,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip) (unsigned long long)ip->i_no_formal_ino); printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)ip->i_no_addr); - printk(KERN_INFO " i_disksize = %llu\n", - (unsigned long long)ip->i_disksize); + printk(KERN_INFO " i_size = %llu\n", + (unsigned long long)i_size_read(&ip->i_inode)); printk(KERN_INFO " blocks = %llu\n", (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); printk(KERN_INFO " i_goal = %llu\n", diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 300ada3f21de..6720d7d5fbc6 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); extern int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state, char *buf, loff_t *pos, unsigned size); +extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, + unsigned int from, unsigned int to); extern void gfs2_set_aops(struct inode *inode); static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) @@ -80,6 +82,19 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); } +static inline int gfs2_check_internal_file_size(struct inode *inode, + u64 minsize, u64 maxsize) +{ + u64 size = i_size_read(inode); + if (size < minsize || size > maxsize) + goto err; + if (size & ((1 << inode->i_blkbits) - 1)) + goto err; + return 0; +err: + gfs2_consist_inode(GFS2_I(inode)); + return -EIO; +} extern void gfs2_set_iop(struct inode *inode); extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 0e0470ed34c2..1c09425b45fd 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -42,9 +42,9 @@ static void gdlm_ast(void *arg) ret |= LM_OUT_CANCELED; goto out; case -EAGAIN: /* Try lock fails */ + case -EDEADLK: /* Deadlock detected */ goto out; - case -EINVAL: /* Invalid */ - case -ENOMEM: /* Out of memory */ + case -ETIMEDOUT: /* Canceled due to timeout */ ret |= LM_OUT_ERROR; goto out; case 0: /* Success */ diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index b1e9630eb46a..d7eb1e209aa8 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -24,6 +24,7 @@ #include "glock.h" #include "quota.h" #include "recovery.h" +#include "dir.h" static struct shrinker qd_shrinker = { .shrink = gfs2_shrink_qd_memory, @@ -78,6 +79,9 @@ static int __init init_gfs2_fs(void) { int error; + gfs2_str2qstr(&gfs2_qdot, "."); + gfs2_str2qstr(&gfs2_qdotdot, ".."); + error = gfs2_sys_init(); if (error) return error; @@ -140,7 +144,7 @@ static int __init init_gfs2_fs(void) error = -ENOMEM; gfs_recovery_wq = alloc_workqueue("gfs_recovery", - WQ_NON_REENTRANT | WQ_RESCUER, 0); + WQ_RESCUER | WQ_FREEZEABLE, 0); if (!gfs_recovery_wq) goto fail_wq; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 4d4b1e8ac64c..aeafc233dc89 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -38,14 +38,6 @@ #define DO 0 #define UNDO 1 -static const u32 gfs2_old_fs_formats[] = { - 0 -}; - -static const u32 gfs2_old_multihost_formats[] = { - 0 -}; - /** * gfs2_tune_init - Fill a gfs2_tune structure with default values * @gt: tune @@ -135,8 +127,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) { - unsigned int x; - if (sb->sb_magic != GFS2_MAGIC || sb->sb_type != GFS2_METATYPE_SB) { if (!silent) @@ -150,55 +140,9 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int sile sb->sb_multihost_format == GFS2_FORMAT_MULTI) return 0; - if (sb->sb_fs_format != GFS2_FORMAT_FS) { - for (x = 0; gfs2_old_fs_formats[x]; x++) - if (gfs2_old_fs_formats[x] == sb->sb_fs_format) - break; + fs_warn(sdp, "Unknown on-disk format, unable to mount\n"); - if (!gfs2_old_fs_formats[x]) { - printk(KERN_WARNING - "GFS2: code version (%u, %u) is incompatible " - "with ondisk format (%u, %u)\n", - GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, - sb->sb_fs_format, sb->sb_multihost_format); - printk(KERN_WARNING - "GFS2: I don't know how to upgrade this FS\n"); - return -EINVAL; - } - } - - if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) { - for (x = 0; gfs2_old_multihost_formats[x]; x++) - if (gfs2_old_multihost_formats[x] == - sb->sb_multihost_format) - break; - - if (!gfs2_old_multihost_formats[x]) { - printk(KERN_WARNING - "GFS2: code version (%u, %u) is incompatible " - "with ondisk format (%u, %u)\n", - GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, - sb->sb_fs_format, sb->sb_multihost_format); - printk(KERN_WARNING - "GFS2: I don't know how to upgrade this FS\n"); - return -EINVAL; - } - } - - if (!sdp->sd_args.ar_upgrade) { - printk(KERN_WARNING - "GFS2: code version (%u, %u) is incompatible " - "with ondisk format (%u, %u)\n", - GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, - sb->sb_fs_format, sb->sb_multihost_format); - printk(KERN_INFO - "GFS2: Use the \"upgrade\" mount option to upgrade " - "the FS\n"); - printk(KERN_INFO "GFS2: See the manual for more details\n"); - return -EINVAL; - } - - return 0; + return -EINVAL; } static void end_bio_io_page(struct bio *bio, int error) @@ -586,7 +530,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp) prev_db = 0; - for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) { + for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) { bh.b_state = 0; bh.b_blocknr = 0; bh.b_size = 1 << ip->i_inode.i_blkbits; @@ -1022,7 +966,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) if (!strcmp("lock_nolock", proto)) { lm = &nolock_ops; sdp->sd_args.ar_localflocks = 1; - sdp->sd_args.ar_localcaching = 1; #ifdef CONFIG_GFS2_FS_LOCKING_DLM } else if (!strcmp("lock_dlm", proto)) { lm = &gfs2_dlm_ops; @@ -1113,8 +1056,6 @@ static int gfs2_journalid_wait(void *word) static int wait_on_journal(struct gfs2_sbd *sdp) { - if (sdp->sd_args.ar_spectator) - return 0; if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) return 0; @@ -1217,6 +1158,20 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (error) goto fail_sb; + /* + * If user space has failed to join the cluster or some similar + * failure has occurred, then the journal id will contain a + * negative (error) number. This will then be returned to the + * caller (of the mount syscall). We do this even for spectator + * mounts (which just write a jid of 0 to indicate "ok" even though + * the jid is unused in the spectator case) + */ + if (sdp->sd_lockstruct.ls_jid < 0) { + error = sdp->sd_lockstruct.ls_jid; + sdp->sd_lockstruct.ls_jid = 0; + goto fail_sb; + } + error = init_inodes(sdp, DO); if (error) goto fail_sb; diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 1009be2c9737..0534510200d5 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -18,6 +18,8 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/fiemap.h> +#include <linux/swap.h> +#include <linux/falloc.h> #include <asm/uaccess.h> #include "gfs2.h" @@ -217,7 +219,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, goto out_gunlock_q; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - al->al_rgd->rd_length + + gfs2_rg_blocks(al) + 2 * RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) @@ -406,7 +408,6 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, ip = ghs[1].gh_gl->gl_object; - ip->i_disksize = size; i_size_write(inode, size); error = gfs2_meta_inode_buffer(ip, &dibh); @@ -461,7 +462,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) ip = ghs[1].gh_gl->gl_object; ip->i_inode.i_nlink = 2; - ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); + i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)); ip->i_diskflags |= GFS2_DIF_JDATA; ip->i_entries = 2; @@ -470,18 +471,15 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (!gfs2_assert_withdraw(sdp, !error)) { struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); - struct qstr str; - gfs2_str2qstr(&str, "."); gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent); + gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent); dent->de_inum = di->di_num; /* already GFS2 endian */ dent->de_type = cpu_to_be16(DT_DIR); di->di_entries = cpu_to_be32(1); - gfs2_str2qstr(&str, ".."); dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); - gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); + gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); gfs2_inum_out(dip, dent); dent->de_type = cpu_to_be16(DT_DIR); @@ -522,7 +520,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, struct gfs2_inode *ip) { - struct qstr dotname; int error; if (ip->i_entries != 2) { @@ -539,13 +536,11 @@ static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, if (error) return error; - gfs2_str2qstr(&dotname, "."); - error = gfs2_dir_del(ip, &dotname); + error = gfs2_dir_del(ip, &gfs2_qdot); if (error) return error; - gfs2_str2qstr(&dotname, ".."); - error = gfs2_dir_del(ip, &dotname); + error = gfs2_dir_del(ip, &gfs2_qdotdot); if (error) return error; @@ -694,11 +689,8 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) struct inode *dir = &to->i_inode; struct super_block *sb = dir->i_sb; struct inode *tmp; - struct qstr dotdot; int error = 0; - gfs2_str2qstr(&dotdot, ".."); - igrab(dir); for (;;) { @@ -711,7 +703,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) break; } - tmp = gfs2_lookupi(dir, &dotdot, 1); + tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1); if (IS_ERR(tmp)) { error = PTR_ERR(tmp); break; @@ -744,7 +736,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_inode *ip = GFS2_I(odentry->d_inode); struct gfs2_inode *nip = NULL; struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; + struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh; struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; @@ -758,6 +750,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, return 0; } + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + return error; if (odip != ndip) { error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, @@ -887,12 +882,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, al->al_requested = sdp->sd_max_dirres; - error = gfs2_inplace_reserve(ndip); + error = gfs2_inplace_reserve_ri(ndip); if (error) goto out_gunlock_q; error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - al->al_rgd->rd_length + + gfs2_rg_blocks(al) + 4 * RES_DINODE + 4 * RES_LEAF + RES_STATFS + RES_QUOTA + 4, 0); if (error) @@ -920,9 +915,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, } if (dir_rename) { - struct qstr name; - gfs2_str2qstr(&name, ".."); - error = gfs2_change_nlink(ndip, +1); if (error) goto out_end_trans; @@ -930,7 +922,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_end_trans; - error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR); + error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); if (error) goto out_end_trans; } else { @@ -972,6 +964,7 @@ out_gunlock_r: if (r_gh.gh_gl) gfs2_glock_dq_uninit(&r_gh); out: + gfs2_glock_dq_uninit(&ri_gh); return error; } @@ -990,7 +983,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) struct gfs2_inode *ip = GFS2_I(dentry->d_inode); struct gfs2_holder i_gh; struct buffer_head *dibh; - unsigned int x; + unsigned int x, size; char *buf; int error; @@ -1002,7 +995,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } - if (!ip->i_disksize) { + size = (unsigned int)i_size_read(&ip->i_inode); + if (size == 0) { gfs2_consist_inode(ip); buf = ERR_PTR(-EIO); goto out; @@ -1014,7 +1008,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) goto out; } - x = ip->i_disksize + 1; + x = size + 1; buf = kmalloc(x, GFP_NOFS); if (!buf) buf = ERR_PTR(-ENOMEM); @@ -1071,30 +1065,6 @@ int gfs2_permission(struct inode *inode, int mask) return error; } -/* - * XXX(truncate): the truncate_setsize calls should be moved to the end. - */ -static int setattr_size(struct inode *inode, struct iattr *attr) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - int error; - - if (attr->ia_size != ip->i_disksize) { - error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); - if (error) - return error; - truncate_setsize(inode, attr->ia_size); - gfs2_trans_end(sdp); - } - - error = gfs2_truncatei(ip, attr->ia_size); - if (error && (inode->i_size != ip->i_disksize)) - i_size_write(inode, ip->i_disksize); - - return error; -} - static int setattr_chown(struct inode *inode, struct iattr *attr) { struct gfs2_inode *ip = GFS2_I(inode); @@ -1195,7 +1165,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) goto out; if (attr->ia_valid & ATTR_SIZE) - error = setattr_size(inode, attr); + error = gfs2_setattr_size(inode, attr->ia_size); else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) error = setattr_chown(inode, attr); else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) @@ -1301,6 +1271,257 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) return ret; } +static void empty_write_end(struct page *page, unsigned from, + unsigned to) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + + page_zero_new_buffers(page, from, to); + flush_dcache_page(page); + mark_page_accessed(page); + + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + block_commit_write(page, from, to); +} + + +static int write_empty_blocks(struct page *page, unsigned from, unsigned to) +{ + unsigned start, end, next; + struct buffer_head *bh, *head; + int error; + + if (!page_has_buffers(page)) { + error = block_prepare_write(page, from, to, gfs2_block_map); + if (unlikely(error)) + return error; + + empty_write_end(page, from, to); + return 0; + } + + bh = head = page_buffers(page); + next = end = 0; + while (next < from) { + next += bh->b_size; + bh = bh->b_this_page; + } + start = next; + do { + next += bh->b_size; + if (buffer_mapped(bh)) { + if (end) { + error = block_prepare_write(page, start, end, + gfs2_block_map); + if (unlikely(error)) + return error; + empty_write_end(page, start, end); + end = 0; + } + start = next; + } + else + end = next; + bh = bh->b_this_page; + } while (next < to); + + if (end) { + error = block_prepare_write(page, start, end, gfs2_block_map); + if (unlikely(error)) + return error; + empty_write_end(page, start, end); + } + + return 0; +} + +static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *dibh; + int error; + u64 start = offset >> PAGE_CACHE_SHIFT; + unsigned int start_offset = offset & ~PAGE_CACHE_MASK; + u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; + pgoff_t curr; + struct page *page; + unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; + unsigned int from, to; + + if (!end_offset) + end_offset = PAGE_CACHE_SIZE; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(error)) + goto out; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (unlikely(error)) + goto out; + } + + curr = start; + offset = start << PAGE_CACHE_SHIFT; + from = start_offset; + to = PAGE_CACHE_SIZE; + while (curr <= end) { + page = grab_cache_page_write_begin(inode->i_mapping, curr, + AOP_FLAG_NOFS); + if (unlikely(!page)) { + error = -ENOMEM; + goto out; + } + + if (curr == end) + to = end_offset; + error = write_empty_blocks(page, from, to); + if (!error && offset + to > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + i_size_write(inode, offset + to); + } + unlock_page(page); + page_cache_release(page); + if (error) + goto out; + curr++; + offset += PAGE_CACHE_SIZE; + from = 0; + } + + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(inode); + + brelse(dibh); + +out: + return error; +} + +static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, + unsigned int *data_blocks, unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; + unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); + + for (tmp = max_data; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + max_data -= tmp; + } + /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, + so it might end up with fewer data blocks */ + if (max_data <= *data_blocks) + return; + *data_blocks = max_data; + *ind_blocks = max_blocks - max_data; + *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; + if (*len > max) { + *len = max; + gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); + } +} + +static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset, + loff_t len) +{ + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + loff_t bytes, max_bytes; + struct gfs2_alloc *al; + int error; + loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; + next = (next + 1) << sdp->sd_sb.sb_bsize_shift; + + offset = (offset >> sdp->sd_sb.sb_bsize_shift) << + sdp->sd_sb.sb_bsize_shift; + + len = next - offset; + bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; + if (!bytes) + bytes = UINT_MAX; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); + if (unlikely(error)) + goto out_uninit; + + if (!gfs2_write_alloc_required(ip, offset, len)) + goto out_unlock; + + while (len > 0) { + if (len < bytes) + bytes = len; + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_unlock; + } + + error = gfs2_quota_lock_check(ip); + if (error) + goto out_alloc_put; + +retry: + gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) { + if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { + bytes >>= 1; + goto retry; + } + goto out_qunlock; + } + max_bytes = bytes; + calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); + al->al_requested = data_blocks + ind_blocks; + + rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + + RES_RG_HDR + gfs2_rg_blocks(al); + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = fallocate_chunk(inode, offset, max_bytes, mode); + gfs2_trans_end(sdp); + + if (error) + goto out_trans_fail; + + len -= max_bytes; + offset += max_bytes; + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + goto out_unlock; + +out_trans_fail: + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); +out_unlock: + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + + static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -1351,6 +1572,7 @@ const struct inode_operations gfs2_file_iops = { .getxattr = gfs2_getxattr, .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, + .fallocate = gfs2_fallocate, .fiemap = gfs2_fiemap, }; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 1bc6b5695e6d..58a9b9998b42 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -735,10 +735,8 @@ get_a_page: goto out; size = loc + sizeof(struct gfs2_quota); - if (size > inode->i_size) { - ip->i_disksize = size; + if (size > inode->i_size) i_size_write(inode, size); - } inode->i_mtime = inode->i_atime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -817,7 +815,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) goto out_alloc; if (nalloc) - blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS; + blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS; error = gfs2_trans_begin(sdp, blocks, 0); if (error) @@ -1190,18 +1188,17 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void * int gfs2_quota_init(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); - unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; + u64 size = i_size_read(sdp->sd_qc_inode); + unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; unsigned int x, slot = 0; unsigned int found = 0; u64 dblock; u32 extlen = 0; int error; - if (!ip->i_disksize || ip->i_disksize > (64 << 20) || - ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) { - gfs2_consist_inode(ip); + if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20)) return -EIO; - } + sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); @@ -1589,6 +1586,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, error = gfs2_inplace_reserve(ip); if (error) goto out_alloc; + blocks += gfs2_rg_blocks(al); } error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index f7f89a94a5a4..f2a02edcac8f 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -455,11 +455,13 @@ void gfs2_recover_func(struct work_struct *work) int ro = 0; unsigned int pass; int error; + int jlocked = 0; - if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { + if (sdp->sd_args.ar_spectator || + (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) { fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", jd->jd_jid); - + jlocked = 1; /* Acquire the journal lock so we can do recovery */ error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, @@ -554,13 +556,12 @@ void gfs2_recover_func(struct work_struct *work) jd->jd_jid, t); } - if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) - gfs2_glock_dq_uninit(&ji_gh); - gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); - if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) + if (jlocked) { + gfs2_glock_dq_uninit(&ji_gh); gfs2_glock_dq_uninit(&j_gh); + } fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); goto done; @@ -568,7 +569,7 @@ void gfs2_recover_func(struct work_struct *work) fail_gunlock_tr: gfs2_glock_dq_uninit(&t_gh); fail_gunlock_ji: - if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { + if (jlocked) { gfs2_glock_dq_uninit(&ji_gh); fail_gunlock_j: gfs2_glock_dq_uninit(&j_gh); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 171a744f8e45..fb67f593f408 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) for (rgrps = 0;; rgrps++) { loff_t pos = rgrps * sizeof(struct gfs2_rindex); - if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize) + if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode)) break; error = gfs2_internal_read(ip, &ra_state, buf, &pos, sizeof(struct gfs2_rindex)); @@ -588,7 +588,9 @@ static int gfs2_ri_update(struct gfs2_inode *ip) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; struct file_ra_state ra_state; - u64 rgrp_count = ip->i_disksize; + u64 rgrp_count = i_size_read(inode); + struct gfs2_rgrpd *rgd; + unsigned int max_data = 0; int error; do_div(rgrp_count, sizeof(struct gfs2_rindex)); @@ -603,6 +605,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip) } } + list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) + if (rgd->rd_data > max_data) + max_data = rgd->rd_data; + sdp->sd_max_rg_data = max_data; sdp->sd_rindex_uptodate = 1; return 0; } @@ -622,13 +628,15 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; struct file_ra_state ra_state; + struct gfs2_rgrpd *rgd; + unsigned int max_data = 0; int error; file_ra_state_init(&ra_state, inode->i_mapping); for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { /* Ignore partials */ if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > - ip->i_disksize) + i_size_read(inode)) break; error = read_rindex_entry(ip, &ra_state); if (error) { @@ -636,6 +644,10 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip) return error; } } + list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) + if (rgd->rd_data > max_data) + max_data = rgd->rd_data; + sdp->sd_max_rg_data = max_data; sdp->sd_rindex_uptodate = 1; return 0; @@ -1188,7 +1200,8 @@ out: * Returns: errno */ -int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) +int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, + char *file, unsigned int line) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; @@ -1199,12 +1212,15 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) return -EINVAL; try_again: - /* We need to hold the rindex unless the inode we're using is - the rindex itself, in which case it's already held. */ - if (ip != GFS2_I(sdp->sd_rindex)) - error = gfs2_rindex_hold(sdp, &al->al_ri_gh); - else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ - error = gfs2_ri_update_special(ip); + if (hold_rindex) { + /* We need to hold the rindex unless the inode we're using is + the rindex itself, in which case it's already held. */ + if (ip != GFS2_I(sdp->sd_rindex)) + error = gfs2_rindex_hold(sdp, &al->al_ri_gh); + else if (!sdp->sd_rgrps) /* We may not have the rindex read + in, so: */ + error = gfs2_ri_update_special(ip); + } if (error) return error; @@ -1215,7 +1231,7 @@ try_again: try to free it, and try the allocation again. */ error = get_local_rgrp(ip, &unlinked, &last_unlinked); if (error) { - if (ip != GFS2_I(sdp->sd_rindex)) + if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) gfs2_glock_dq_uninit(&al->al_ri_gh); if (error != -EAGAIN) return error; @@ -1257,7 +1273,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip) al->al_rgd = NULL; if (al->al_rgd_gh.gh_gl) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (ip != GFS2_I(sdp->sd_rindex)) + if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl) gfs2_glock_dq_uninit(&al->al_ri_gh); } @@ -1496,11 +1512,19 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *dibh; struct gfs2_alloc *al = ip->i_alloc; - struct gfs2_rgrpd *rgd = al->al_rgd; + struct gfs2_rgrpd *rgd; u32 goal, blk; u64 block; int error; + /* Only happens if there is a bug in gfs2, return something distinctive + * to ensure that it is noticed. + */ + if (al == NULL) + return -ECANCELED; + + rgd = al->al_rgd; + if (rgrp_contains_block(rgd, ip->i_goal)) goal = ip->i_goal - rgd->rd_data0; else diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index f07119d89557..0e35c0466f9a 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -39,10 +39,12 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip) ip->i_alloc = NULL; } -extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, - unsigned int line); +extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, + char *file, unsigned int line); #define gfs2_inplace_reserve(ip) \ -gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) + gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__) +#define gfs2_inplace_reserve_ri(ip) \ + gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__) extern void gfs2_inplace_release(struct gfs2_inode *ip); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 77cb9f830ee4..047d1176096c 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -85,6 +85,7 @@ static const match_table_t tokens = { {Opt_locktable, "locktable=%s"}, {Opt_hostdata, "hostdata=%s"}, {Opt_spectator, "spectator"}, + {Opt_spectator, "norecovery"}, {Opt_ignore_local_fs, "ignore_local_fs"}, {Opt_localflocks, "localflocks"}, {Opt_localcaching, "localcaching"}, @@ -159,13 +160,13 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) args->ar_spectator = 1; break; case Opt_ignore_local_fs: - args->ar_ignore_local_fs = 1; + /* Retained for backwards compat only */ break; case Opt_localflocks: args->ar_localflocks = 1; break; case Opt_localcaching: - args->ar_localcaching = 1; + /* Retained for backwards compat only */ break; case Opt_debug: if (args->ar_errors == GFS2_ERRORS_PANIC) { @@ -179,7 +180,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) args->ar_debug = 0; break; case Opt_upgrade: - args->ar_upgrade = 1; + /* Retained for backwards compat only */ break; case Opt_acl: args->ar_posix_acl = 1; @@ -342,15 +343,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + u64 size = i_size_read(jd->jd_inode); - if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) || - (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) { - gfs2_consist_inode(ip); + if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30)) return -EIO; - } - jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; - if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) { + jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift; + + if (gfs2_write_alloc_required(ip, 0, size)) { gfs2_consist_inode(ip); return -EIO; } @@ -1129,9 +1129,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) /* Some flags must not be changed */ if (args_neq(&args, &sdp->sd_args, spectator) || - args_neq(&args, &sdp->sd_args, ignore_local_fs) || args_neq(&args, &sdp->sd_args, localflocks) || - args_neq(&args, &sdp->sd_args, localcaching) || args_neq(&args, &sdp->sd_args, meta)) return -EINVAL; @@ -1234,16 +1232,10 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) seq_printf(s, ",hostdata=%s", args->ar_hostdata); if (args->ar_spectator) seq_printf(s, ",spectator"); - if (args->ar_ignore_local_fs) - seq_printf(s, ",ignore_local_fs"); if (args->ar_localflocks) seq_printf(s, ",localflocks"); - if (args->ar_localcaching) - seq_printf(s, ",localcaching"); if (args->ar_debug) seq_printf(s, ",debug"); - if (args->ar_upgrade) - seq_printf(s, ",upgrade"); if (args->ar_posix_acl) seq_printf(s, ",acl"); if (args->ar_quota != GFS2_QUOTA_DEFAULT) { diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index ccacffd2faaa..748ccb557c18 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -230,7 +230,10 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len if (gltype > LM_TYPE_JOURNAL) return -EINVAL; - glops = gfs2_glops_list[gltype]; + if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK) + glops = &gfs2_trans_glops; + else + glops = gfs2_glops_list[gltype]; if (glops == NULL) return -EINVAL; if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) @@ -399,31 +402,32 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) { - return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid); + return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid); } static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { - unsigned jid; + int jid; int rv; - rv = sscanf(buf, "%u", &jid); + rv = sscanf(buf, "%d", &jid); if (rv != 1) return -EINVAL; spin_lock(&sdp->sd_jindex_spin); rv = -EINVAL; - if (sdp->sd_args.ar_spectator) - goto out; if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) goto out; rv = -EBUSY; - if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) goto out; + rv = 0; + if (sdp->sd_args.ar_spectator && jid > 0) + rv = jid = -EINVAL; sdp->sd_lockstruct.ls_jid = jid; + clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); smp_mb__after_clear_bit(); wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); - rv = 0; out: spin_unlock(&sdp->sd_jindex_spin); return rv ? rv : len; @@ -617,7 +621,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) - add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); + add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid); if (gfs2_uuid_valid(uuid)) add_uevent_var(env, "UUID=%pUB", uuid); return 0; diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 148d55c14171..cedb0bb96d96 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -39,7 +39,8 @@ {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ {(1UL << GLF_REPLY_PENDING), "r" }, \ {(1UL << GLF_INITIAL), "I" }, \ - {(1UL << GLF_FROZEN), "F" }) + {(1UL << GLF_FROZEN), "F" }, \ + {(1UL << GLF_QUEUED), "q" }) #ifndef NUMPTY #define NUMPTY diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index edf9d4bd908e..fb56b783e028 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -20,11 +20,20 @@ struct gfs2_glock; #define RES_JDATA 1 #define RES_DATA 1 #define RES_LEAF 1 +#define RES_RG_HDR 1 #define RES_RG_BIT 2 #define RES_EATTR 1 #define RES_STATFS 1 #define RES_QUOTA 2 +/* reserve either the number of blocks to be allocated plus the rg header + * block, or all of the blocks in the rg, whichever is smaller */ +static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al) +{ + return (al->al_requested < al->al_rgd->rd_length)? + al->al_requested + 1 : al->al_rgd->rd_length; +} + int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, unsigned int revokes); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 776af6eb4bcb..30b58f07c8a6 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, goto out_gunlock_q; error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), - blks + al->al_rgd->rd_length + + blks + gfs2_rg_blocks(al) + RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) goto out_ipres; diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c index 4129cdb3f0d8..571abe97b42a 100644 --- a/fs/hfs/bfind.c +++ b/fs/hfs/bfind.c @@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); - down(&tree->tree_lock); + mutex_lock(&tree->tree_lock); return 0; } @@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd) hfs_bnode_put(fd->bnode); kfree(fd->search_key); dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); - up(&fd->tree->tree_lock); + mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 38a0a9917d7f..3ebc437736fe 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -27,7 +27,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke if (!tree) return NULL; - init_MUTEX(&tree->tree_lock); + mutex_init(&tree->tree_lock); spin_lock_init(&tree->hash_lock); /* Set the correct compare function */ tree->sb = sb; diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h index cc51905ac21d..2a1d712f85dc 100644 --- a/fs/hfs/btree.h +++ b/fs/hfs/btree.h @@ -33,7 +33,7 @@ struct hfs_btree { unsigned int depth; //unsigned int map1_size, map_size; - struct semaphore tree_lock; + struct mutex tree_lock; unsigned int pages_per_bnode; spinlock_t hash_lock; diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index 5007a41f1be9..d182438c7ae4 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); - down(&tree->tree_lock); + mutex_lock(&tree->tree_lock); return 0; } @@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd) hfs_bnode_put(fd->bnode); kfree(fd->search_key); dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); - up(&fd->tree->tree_lock); + mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } @@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) rec = (e + b) / 2; len = hfs_brec_lenoff(bnode, rec, &off); keylen = hfs_brec_keylen(bnode, rec); + if (keylen == 0) { + res = -EINVAL; + goto fail; + } hfs_bnode_read(bnode, fd->key, off, keylen); cmpval = bnode->tree->keycmp(fd->key, fd->search_key); if (!cmpval) { @@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) if (rec != e && e >= 0) { len = hfs_brec_lenoff(bnode, e, &off); keylen = hfs_brec_keylen(bnode, e); + if (keylen == 0) { + res = -EINVAL; + goto fail; + } hfs_bnode_read(bnode, fd->key, off, keylen); } done: @@ -75,6 +83,7 @@ done: fd->keylength = keylen; fd->entryoffset = off + keylen; fd->entrylength = len - keylen; +fail: return res; } @@ -198,6 +207,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt) len = hfs_brec_lenoff(bnode, fd->record, &off); keylen = hfs_brec_keylen(bnode, fd->record); + if (keylen == 0) { + res = -EINVAL; + goto out; + } fd->keyoffset = off; fd->keylength = keylen; fd->entryoffset = off + keylen; diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index ea30afc2a03c..ad57f5991eb1 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -17,6 +17,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct page *page; struct address_space *mapping; __be32 *pptr, *curr, *end; @@ -29,8 +30,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma return size; dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); - mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); - mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; + mutex_lock(&sbi->alloc_mutex); + mapping = sbi->alloc_file->i_mapping; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); if (IS_ERR(page)) { start = size; @@ -150,16 +151,17 @@ done: set_page_dirty(page); kunmap(page); *max = offset + (curr - pptr) * 32 + i - start; - HFSPLUS_SB(sb).free_blocks -= *max; + sbi->free_blocks -= *max; sb->s_dirt = 1; dprint(DBG_BITMAP, "-> %u,%u\n", start, *max); out: - mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex); + mutex_unlock(&sbi->alloc_mutex); return start; } int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct page *page; struct address_space *mapping; __be32 *pptr, *curr, *end; @@ -172,11 +174,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count); /* are all of the bits in range? */ - if ((offset + count) > HFSPLUS_SB(sb).total_blocks) + if ((offset + count) > sbi->total_blocks) return -2; - mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); - mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; + mutex_lock(&sbi->alloc_mutex); + mapping = sbi->alloc_file->i_mapping; pnr = offset / PAGE_CACHE_BITS; page = read_mapping_page(mapping, pnr, NULL); pptr = kmap(page); @@ -224,9 +226,9 @@ done: out: set_page_dirty(page); kunmap(page); - HFSPLUS_SB(sb).free_blocks += len; + sbi->free_blocks += len; sb->s_dirt = 1; - mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex); + mutex_unlock(&sbi->alloc_mutex); return 0; } diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index c88e5d72a402..2f39d05443e1 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -42,10 +42,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2); if (!recoff) return 0; - if (node->tree->attributes & HFS_TREE_BIGKEYS) - retval = hfs_bnode_read_u16(node, recoff) + 2; - else - retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1; + + retval = hfs_bnode_read_u16(node, recoff) + 2; + if (retval > node->tree->max_key_len + 2) { + printk(KERN_ERR "hfs: keylen %d too large\n", + retval); + retval = 0; + } } return retval; } @@ -216,7 +219,7 @@ skip: static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) { struct hfs_btree *tree; - struct hfs_bnode *node, *new_node; + struct hfs_bnode *node, *new_node, *next_node; struct hfs_bnode_desc node_desc; int num_recs, new_rec_off, new_off, old_rec_off; int data_start, data_end, size; @@ -235,6 +238,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) new_node->type = node->type; new_node->height = node->height; + if (node->next) + next_node = hfs_bnode_find(tree, node->next); + else + next_node = NULL; + + if (IS_ERR(next_node)) { + hfs_bnode_put(node); + hfs_bnode_put(new_node); + return next_node; + } + size = tree->node_size / 2 - node->num_recs * 2 - 14; old_rec_off = tree->node_size - 4; num_recs = 1; @@ -248,6 +262,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) /* panic? */ hfs_bnode_put(node); hfs_bnode_put(new_node); + if (next_node) + hfs_bnode_put(next_node); return ERR_PTR(-ENOSPC); } @@ -302,8 +318,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc)); /* update next bnode header */ - if (new_node->next) { - struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next); + if (next_node) { next_node->prev = new_node->this; hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc)); node_desc.prev = cpu_to_be32(next_node->prev); diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index e49fcee1e293..22e4d4e32999 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) if (!tree) return NULL; - init_MUTEX(&tree->tree_lock); + mutex_init(&tree->tree_lock); spin_lock_init(&tree->hash_lock); tree->sb = sb; tree->cnid = id; @@ -39,10 +39,16 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) goto free_tree; tree->inode = inode; + if (!HFSPLUS_I(tree->inode)->first_blocks) { + printk(KERN_ERR + "hfs: invalid btree extent records (0 size).\n"); + goto free_inode; + } + mapping = tree->inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) - goto free_tree; + goto free_inode; /* Load the header */ head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); @@ -57,27 +63,56 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) tree->max_key_len = be16_to_cpu(head->max_key_len); tree->depth = be16_to_cpu(head->depth); - /* Set the correct compare function */ - if (id == HFSPLUS_EXT_CNID) { + /* Verify the tree and set the correct compare function */ + switch (id) { + case HFSPLUS_EXT_CNID: + if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) { + printk(KERN_ERR "hfs: invalid extent max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + if (tree->attributes & HFS_TREE_VARIDXKEYS) { + printk(KERN_ERR "hfs: invalid extent btree flag\n"); + goto fail_page; + } + tree->keycmp = hfsplus_ext_cmp_key; - } else if (id == HFSPLUS_CAT_CNID) { - if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) && + break; + case HFSPLUS_CAT_CNID: + if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) { + printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n", + tree->max_key_len); + goto fail_page; + } + if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { + printk(KERN_ERR "hfs: invalid catalog btree flag\n"); + goto fail_page; + } + + if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) && (head->key_type == HFSPLUS_KEY_BINARY)) tree->keycmp = hfsplus_cat_bin_cmp_key; else { tree->keycmp = hfsplus_cat_case_cmp_key; - HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD; + set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); } - } else { + break; + default: printk(KERN_ERR "hfs: unknown B*Tree requested\n"); goto fail_page; } + if (!(tree->attributes & HFS_TREE_BIGKEYS)) { + printk(KERN_ERR "hfs: invalid btree flag\n"); + goto fail_page; + } + size = tree->node_size; if (!is_power_of_2(size)) goto fail_page; if (!tree->node_count) goto fail_page; + tree->node_size_shift = ffs(size) - 1; tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; @@ -87,10 +122,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) return tree; fail_page: - tree->inode->i_mapping->a_ops = &hfsplus_aops; page_cache_release(page); - free_tree: + free_inode: + tree->inode->i_mapping->a_ops = &hfsplus_aops; iput(tree->inode); + free_tree: kfree(tree); return NULL; } @@ -192,17 +228,18 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) while (!tree->free_nodes) { struct inode *inode = tree->inode; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); u32 count; int res; res = hfsplus_file_extend(inode); if (res) return ERR_PTR(res); - HFSPLUS_I(inode).phys_size = inode->i_size = - (loff_t)HFSPLUS_I(inode).alloc_blocks << - HFSPLUS_SB(tree->sb).alloc_blksz_shift; - HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks << - HFSPLUS_SB(tree->sb).fs_shift; + hip->phys_size = inode->i_size = + (loff_t)hip->alloc_blocks << + HFSPLUS_SB(tree->sb)->alloc_blksz_shift; + hip->fs_blocks = + hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift; inode_set_bytes(inode, inode->i_size); count = inode->i_size >> tree->node_size_shift; tree->free_nodes = count - tree->node_count; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index f6874acb2cf2..8af45fc5b051 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -67,7 +67,7 @@ static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent, key->key_len = cpu_to_be16(6 + ustrlen); } -static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) +void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms) { if (inode->i_flags & S_IMMUTABLE) perms->rootflags |= HFSPLUS_FLG_IMMUTABLE; @@ -77,15 +77,24 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) perms->rootflags |= HFSPLUS_FLG_APPEND; else perms->rootflags &= ~HFSPLUS_FLG_APPEND; - HFSPLUS_I(inode).rootflags = perms->rootflags; - HFSPLUS_I(inode).userflags = perms->userflags; + + perms->userflags = HFSPLUS_I(inode)->userflags; perms->mode = cpu_to_be16(inode->i_mode); perms->owner = cpu_to_be32(inode->i_uid); perms->group = cpu_to_be32(inode->i_gid); + + if (S_ISREG(inode->i_mode)) + perms->dev = cpu_to_be32(inode->i_nlink); + else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) + perms->dev = cpu_to_be32(inode->i_rdev); + else + perms->dev = 0; } static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + if (S_ISDIR(inode->i_mode)) { struct hfsplus_cat_folder *folder; @@ -93,13 +102,13 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i memset(folder, 0, sizeof(*folder)); folder->type = cpu_to_be16(HFSPLUS_FOLDER); folder->id = cpu_to_be32(inode->i_ino); - HFSPLUS_I(inode).create_date = + HFSPLUS_I(inode)->create_date = folder->create_date = folder->content_mod_date = folder->attribute_mod_date = folder->access_date = hfsp_now2mt(); - hfsplus_set_perms(inode, &folder->permissions); - if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir) + hfsplus_cat_set_perms(inode, &folder->permissions); + if (inode == sbi->hidden_dir) /* invisible and namelocked */ folder->user_info.frFlags = cpu_to_be16(0x5000); return sizeof(*folder); @@ -111,19 +120,19 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i file->type = cpu_to_be16(HFSPLUS_FILE); file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS); file->id = cpu_to_be32(cnid); - HFSPLUS_I(inode).create_date = + HFSPLUS_I(inode)->create_date = file->create_date = file->content_mod_date = file->attribute_mod_date = file->access_date = hfsp_now2mt(); if (cnid == inode->i_ino) { - hfsplus_set_perms(inode, &file->permissions); + hfsplus_cat_set_perms(inode, &file->permissions); if (S_ISLNK(inode->i_mode)) { file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE); file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR); } else { - file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type); - file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator); + file->user_info.fdType = cpu_to_be32(sbi->type); + file->user_info.fdCreator = cpu_to_be32(sbi->creator); } if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); @@ -131,8 +140,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE); file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR); file->user_info.fdFlags = cpu_to_be16(0x100); - file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date; - file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev); + file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date; + file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid); } return sizeof(*file); } @@ -180,15 +189,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid, int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode) { + struct super_block *sb = dir->i_sb; struct hfs_find_data fd; - struct super_block *sb; hfsplus_cat_entry entry; int entry_size; int err; dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); - sb = dir->i_sb; - hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ? @@ -234,7 +242,7 @@ err2: int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) { - struct super_block *sb; + struct super_block *sb = dir->i_sb; struct hfs_find_data fd; struct hfsplus_fork_raw fork; struct list_head *pos; @@ -242,8 +250,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) u16 type; dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); - sb = dir->i_sb; - hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (!str) { int len; @@ -279,7 +286,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC); } - list_for_each(pos, &HFSPLUS_I(dir).open_dir_list) { + list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) { struct hfsplus_readdir_data *rd = list_entry(pos, struct hfsplus_readdir_data, list); if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) @@ -312,7 +319,7 @@ int hfsplus_rename_cat(u32 cnid, struct inode *src_dir, struct qstr *src_name, struct inode *dst_dir, struct qstr *dst_name) { - struct super_block *sb; + struct super_block *sb = src_dir->i_sb; struct hfs_find_data src_fd, dst_fd; hfsplus_cat_entry entry; int entry_size, type; @@ -320,8 +327,7 @@ int hfsplus_rename_cat(u32 cnid, dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); - sb = src_dir->i_sb; - hfs_find_init(HFSPLUS_SB(sb).cat_tree, &src_fd); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); dst_fd = src_fd; /* find the old dir entry and read the data */ diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 764fd1bdca88..d236d85ec9d7 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -39,7 +39,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, dentry->d_op = &hfsplus_dentry_operations; dentry->d_fsdata = NULL; - hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); again: err = hfs_brec_read(&fd, &entry, sizeof(entry)); @@ -68,9 +68,9 @@ again: cnid = be32_to_cpu(entry.file.id); if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) && entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) && - (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date || - entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) && - HFSPLUS_SB(sb).hidden_dir) { + (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date || + entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) && + HFSPLUS_SB(sb)->hidden_dir) { struct qstr str; char name[32]; @@ -86,7 +86,8 @@ again: linkid = be32_to_cpu(entry.file.permissions.dev); str.len = sprintf(name, "iNode%d", linkid); str.name = name; - hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str); + hfsplus_cat_build_key(sb, fd.search_key, + HFSPLUS_SB(sb)->hidden_dir->i_ino, &str); goto again; } } else if (!dentry->d_fsdata) @@ -101,7 +102,7 @@ again: if (IS_ERR(inode)) return ERR_CAST(inode); if (S_ISREG(inode->i_mode)) - HFSPLUS_I(inode).dev = linkid; + HFSPLUS_I(inode)->linkid = linkid; out: d_add(dentry, inode); return NULL; @@ -124,7 +125,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) if (filp->f_pos >= inode->i_size) return 0; - hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); err = hfs_brec_find(&fd); if (err) @@ -180,8 +181,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) err = -EIO; goto out; } - if (HFSPLUS_SB(sb).hidden_dir && - HFSPLUS_SB(sb).hidden_dir->i_ino == be32_to_cpu(entry.folder.id)) + if (HFSPLUS_SB(sb)->hidden_dir && + HFSPLUS_SB(sb)->hidden_dir->i_ino == + be32_to_cpu(entry.folder.id)) goto next; if (filldir(dirent, strbuf, len, filp->f_pos, be32_to_cpu(entry.folder.id), DT_DIR)) @@ -217,7 +219,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) } filp->private_data = rd; rd->file = filp; - list_add(&rd->list, &HFSPLUS_I(inode).open_dir_list); + list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list); } memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); out: @@ -229,38 +231,18 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file) { struct hfsplus_readdir_data *rd = file->private_data; if (rd) { + mutex_lock(&inode->i_mutex); list_del(&rd->list); + mutex_unlock(&inode->i_mutex); kfree(rd); } return 0; } -static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) -{ - struct inode *inode; - int res; - - inode = hfsplus_new_inode(dir->i_sb, mode); - if (!inode) - return -ENOSPC; - - res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); - if (res) { - inode->i_nlink = 0; - hfsplus_delete_inode(inode); - iput(inode); - return res; - } - hfsplus_instantiate(dentry, inode, inode->i_ino); - mark_inode_dirty(inode); - return 0; -} - static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, struct dentry *dst_dentry) { - struct super_block *sb = dst_dir->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb); struct inode *inode = src_dentry->d_inode; struct inode *src_dir = src_dentry->d_parent->d_inode; struct qstr str; @@ -270,7 +252,10 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, if (HFSPLUS_IS_RSRC(inode)) return -EPERM; + if (!S_ISREG(inode->i_mode)) + return -EPERM; + mutex_lock(&sbi->vh_mutex); if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) { for (;;) { get_random_bytes(&id, sizeof(cnid)); @@ -279,40 +264,41 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, str.len = sprintf(name, "iNode%d", id); res = hfsplus_rename_cat(inode->i_ino, src_dir, &src_dentry->d_name, - HFSPLUS_SB(sb).hidden_dir, &str); + sbi->hidden_dir, &str); if (!res) break; if (res != -EEXIST) - return res; + goto out; } - HFSPLUS_I(inode).dev = id; - cnid = HFSPLUS_SB(sb).next_cnid++; + HFSPLUS_I(inode)->linkid = id; + cnid = sbi->next_cnid++; src_dentry->d_fsdata = (void *)(unsigned long)cnid; res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode); if (res) /* panic? */ - return res; - HFSPLUS_SB(sb).file_count++; + goto out; + sbi->file_count++; } - cnid = HFSPLUS_SB(sb).next_cnid++; + cnid = sbi->next_cnid++; res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode); if (res) - return res; + goto out; inc_nlink(inode); hfsplus_instantiate(dst_dentry, inode, cnid); atomic_inc(&inode->i_count); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - HFSPLUS_SB(sb).file_count++; - sb->s_dirt = 1; - - return 0; + sbi->file_count++; + dst_dir->i_sb->s_dirt = 1; +out: + mutex_unlock(&sbi->vh_mutex); + return res; } static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) { - struct super_block *sb = dir->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode = dentry->d_inode; struct qstr str; char name[32]; @@ -322,21 +308,22 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) if (HFSPLUS_IS_RSRC(inode)) return -EPERM; + mutex_lock(&sbi->vh_mutex); cnid = (u32)(unsigned long)dentry->d_fsdata; if (inode->i_ino == cnid && - atomic_read(&HFSPLUS_I(inode).opencnt)) { + atomic_read(&HFSPLUS_I(inode)->opencnt)) { str.name = name; str.len = sprintf(name, "temp%lu", inode->i_ino); res = hfsplus_rename_cat(inode->i_ino, dir, &dentry->d_name, - HFSPLUS_SB(sb).hidden_dir, &str); + sbi->hidden_dir, &str); if (!res) inode->i_flags |= S_DEAD; - return res; + goto out; } res = hfsplus_delete_cat(cnid, dir, &dentry->d_name); if (res) - return res; + goto out; if (inode->i_nlink > 0) drop_nlink(inode); @@ -344,10 +331,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) clear_nlink(inode); if (!inode->i_nlink) { if (inode->i_ino != cnid) { - HFSPLUS_SB(sb).file_count--; - if (!atomic_read(&HFSPLUS_I(inode).opencnt)) { + sbi->file_count--; + if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) { res = hfsplus_delete_cat(inode->i_ino, - HFSPLUS_SB(sb).hidden_dir, + sbi->hidden_dir, NULL); if (!res) hfsplus_delete_inode(inode); @@ -356,107 +343,108 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) } else hfsplus_delete_inode(inode); } else - HFSPLUS_SB(sb).file_count--; + sbi->file_count--; inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - +out: + mutex_unlock(&sbi->vh_mutex); return res; } -static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode) -{ - struct inode *inode; - int res; - - inode = hfsplus_new_inode(dir->i_sb, S_IFDIR | mode); - if (!inode) - return -ENOSPC; - - res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); - if (res) { - inode->i_nlink = 0; - hfsplus_delete_inode(inode); - iput(inode); - return res; - } - hfsplus_instantiate(dentry, inode, inode->i_ino); - mark_inode_dirty(inode); - return 0; -} - static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); + struct inode *inode = dentry->d_inode; int res; - inode = dentry->d_inode; if (inode->i_size != 2) return -ENOTEMPTY; + + mutex_lock(&sbi->vh_mutex); res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); if (res) - return res; + goto out; clear_nlink(inode); inode->i_ctime = CURRENT_TIME_SEC; hfsplus_delete_inode(inode); mark_inode_dirty(inode); - return 0; +out: + mutex_unlock(&sbi->vh_mutex); + return res; } static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct super_block *sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode; - int res; + int res = -ENOSPC; - sb = dir->i_sb; - inode = hfsplus_new_inode(sb, S_IFLNK | S_IRWXUGO); + mutex_lock(&sbi->vh_mutex); + inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO); if (!inode) - return -ENOSPC; + goto out; res = page_symlink(inode, symname, strlen(symname) + 1); - if (res) { - inode->i_nlink = 0; - hfsplus_delete_inode(inode); - iput(inode); - return res; - } + if (res) + goto out_err; - mark_inode_dirty(inode); res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); + if (res) + goto out_err; - if (!res) { - hfsplus_instantiate(dentry, inode, inode->i_ino); - mark_inode_dirty(inode); - } + hfsplus_instantiate(dentry, inode, inode->i_ino); + mark_inode_dirty(inode); + goto out; +out_err: + inode->i_nlink = 0; + hfsplus_delete_inode(inode); + iput(inode); +out: + mutex_unlock(&sbi->vh_mutex); return res; } static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) { - struct super_block *sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode; - int res; + int res = -ENOSPC; - sb = dir->i_sb; - inode = hfsplus_new_inode(sb, mode); + mutex_lock(&sbi->vh_mutex); + inode = hfsplus_new_inode(dir->i_sb, mode); if (!inode) - return -ENOSPC; + goto out; + + if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) + init_special_inode(inode, mode, rdev); res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); if (res) { inode->i_nlink = 0; hfsplus_delete_inode(inode); iput(inode); - return res; + goto out; } - init_special_inode(inode, mode, rdev); + hfsplus_instantiate(dentry, inode, inode->i_ino); mark_inode_dirty(inode); +out: + mutex_unlock(&sbi->vh_mutex); + return res; +} - return 0; +static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + return hfsplus_mknod(dir, dentry, mode, 0); +} + +static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0); } static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, @@ -466,7 +454,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, /* Unlink destination if it already exists */ if (new_dentry->d_inode) { - res = hfsplus_unlink(new_dir, new_dentry); + if (S_ISDIR(new_dentry->d_inode->i_mode)) + res = hfsplus_rmdir(new_dir, new_dentry); + else + res = hfsplus_unlink(new_dir, new_dentry); if (res) return res; } diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 0022eec63cda..0c9cb1820a52 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -85,35 +85,49 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext) static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) { + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); int res; - hfsplus_ext_build_key(fd->search_key, inode->i_ino, HFSPLUS_I(inode).cached_start, - HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); + WARN_ON(!mutex_is_locked(&hip->extents_lock)); + + hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start, + HFSPLUS_IS_RSRC(inode) ? + HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); + res = hfs_brec_find(fd); - if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_NEW) { + if (hip->flags & HFSPLUS_FLG_EXT_NEW) { if (res != -ENOENT) return; - hfs_brec_insert(fd, HFSPLUS_I(inode).cached_extents, sizeof(hfsplus_extent_rec)); - HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); + hfs_brec_insert(fd, hip->cached_extents, + sizeof(hfsplus_extent_rec)); + hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); } else { if (res) return; - hfs_bnode_write(fd->bnode, HFSPLUS_I(inode).cached_extents, fd->entryoffset, fd->entrylength); - HFSPLUS_I(inode).flags &= ~HFSPLUS_FLG_EXT_DIRTY; + hfs_bnode_write(fd->bnode, hip->cached_extents, + fd->entryoffset, fd->entrylength); + hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY; } } -void hfsplus_ext_write_extent(struct inode *inode) +static void hfsplus_ext_write_extent_locked(struct inode *inode) { - if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) { + if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) { struct hfs_find_data fd; - hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); + hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); __hfsplus_ext_write_extent(inode, &fd); hfs_find_exit(&fd); } } +void hfsplus_ext_write_extent(struct inode *inode) +{ + mutex_lock(&HFSPLUS_I(inode)->extents_lock); + hfsplus_ext_write_extent_locked(inode); + mutex_unlock(&HFSPLUS_I(inode)->extents_lock); +} + static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, struct hfsplus_extent *extent, u32 cnid, u32 block, u8 type) @@ -136,33 +150,39 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) { + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); int res; - if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) + WARN_ON(!mutex_is_locked(&hip->extents_lock)); + + if (hip->flags & HFSPLUS_FLG_EXT_DIRTY) __hfsplus_ext_write_extent(inode, fd); - res = __hfsplus_ext_read_extent(fd, HFSPLUS_I(inode).cached_extents, inode->i_ino, - block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); + res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino, + block, HFSPLUS_IS_RSRC(inode) ? + HFSPLUS_TYPE_RSRC : + HFSPLUS_TYPE_DATA); if (!res) { - HFSPLUS_I(inode).cached_start = be32_to_cpu(fd->key->ext.start_block); - HFSPLUS_I(inode).cached_blocks = hfsplus_ext_block_count(HFSPLUS_I(inode).cached_extents); + hip->cached_start = be32_to_cpu(fd->key->ext.start_block); + hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents); } else { - HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; - HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); + hip->cached_start = hip->cached_blocks = 0; + hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); } return res; } static int hfsplus_ext_read_extent(struct inode *inode, u32 block) { + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); struct hfs_find_data fd; int res; - if (block >= HFSPLUS_I(inode).cached_start && - block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks) + if (block >= hip->cached_start && + block < hip->cached_start + hip->cached_blocks) return 0; - hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); + hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); res = __hfsplus_ext_cache_extent(&fd, inode, block); hfs_find_exit(&fd); return res; @@ -172,21 +192,21 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block) int hfsplus_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - struct super_block *sb; + struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); int res = -EIO; u32 ablock, dblock, mask; int shift; - sb = inode->i_sb; - /* Convert inode block to disk allocation block */ - shift = HFSPLUS_SB(sb).alloc_blksz_shift - sb->s_blocksize_bits; - ablock = iblock >> HFSPLUS_SB(sb).fs_shift; + shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; + ablock = iblock >> sbi->fs_shift; - if (iblock >= HFSPLUS_I(inode).fs_blocks) { - if (iblock > HFSPLUS_I(inode).fs_blocks || !create) + if (iblock >= hip->fs_blocks) { + if (iblock > hip->fs_blocks || !create) return -EIO; - if (ablock >= HFSPLUS_I(inode).alloc_blocks) { + if (ablock >= hip->alloc_blocks) { res = hfsplus_file_extend(inode); if (res) return res; @@ -194,33 +214,33 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, } else create = 0; - if (ablock < HFSPLUS_I(inode).first_blocks) { - dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).first_extents, ablock); + if (ablock < hip->first_blocks) { + dblock = hfsplus_ext_find_block(hip->first_extents, ablock); goto done; } if (inode->i_ino == HFSPLUS_EXT_CNID) return -EIO; - mutex_lock(&HFSPLUS_I(inode).extents_lock); + mutex_lock(&hip->extents_lock); res = hfsplus_ext_read_extent(inode, ablock); if (!res) { - dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock - - HFSPLUS_I(inode).cached_start); + dblock = hfsplus_ext_find_block(hip->cached_extents, + ablock - hip->cached_start); } else { - mutex_unlock(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&hip->extents_lock); return -EIO; } - mutex_unlock(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&hip->extents_lock); done: dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); - mask = (1 << HFSPLUS_SB(sb).fs_shift) - 1; - map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask)); + mask = (1 << sbi->fs_shift) - 1; + map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask)); if (create) { set_buffer_new(bh_result); - HFSPLUS_I(inode).phys_size += sb->s_blocksize; - HFSPLUS_I(inode).fs_blocks++; + hip->phys_size += sb->s_blocksize; + hip->fs_blocks++; inode_add_bytes(inode, sb->s_blocksize); mark_inode_dirty(inode); } @@ -327,7 +347,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw if (total_blocks == blocks) return 0; - hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); + hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); do { res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, total_blocks, type); @@ -348,29 +368,33 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw int hfsplus_file_extend(struct inode *inode) { struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); u32 start, len, goal; int res; - if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) { + if (sbi->alloc_file->i_size * 8 < + sbi->total_blocks - sbi->free_blocks + 8) { // extend alloc file - printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8, - HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks); + printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", + sbi->alloc_file->i_size * 8, + sbi->total_blocks, sbi->free_blocks); return -ENOSPC; } - mutex_lock(&HFSPLUS_I(inode).extents_lock); - if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks) - goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents); + mutex_lock(&hip->extents_lock); + if (hip->alloc_blocks == hip->first_blocks) + goal = hfsplus_ext_lastblock(hip->first_extents); else { - res = hfsplus_ext_read_extent(inode, HFSPLUS_I(inode).alloc_blocks); + res = hfsplus_ext_read_extent(inode, hip->alloc_blocks); if (res) goto out; - goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).cached_extents); + goal = hfsplus_ext_lastblock(hip->cached_extents); } - len = HFSPLUS_I(inode).clump_blocks; - start = hfsplus_block_allocate(sb, HFSPLUS_SB(sb).total_blocks, goal, &len); - if (start >= HFSPLUS_SB(sb).total_blocks) { + len = hip->clump_blocks; + start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len); + if (start >= sbi->total_blocks) { start = hfsplus_block_allocate(sb, goal, 0, &len); if (start >= goal) { res = -ENOSPC; @@ -379,56 +403,56 @@ int hfsplus_file_extend(struct inode *inode) } dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); - if (HFSPLUS_I(inode).alloc_blocks <= HFSPLUS_I(inode).first_blocks) { - if (!HFSPLUS_I(inode).first_blocks) { + + if (hip->alloc_blocks <= hip->first_blocks) { + if (!hip->first_blocks) { dprint(DBG_EXTENT, "first extents\n"); /* no extents yet */ - HFSPLUS_I(inode).first_extents[0].start_block = cpu_to_be32(start); - HFSPLUS_I(inode).first_extents[0].block_count = cpu_to_be32(len); + hip->first_extents[0].start_block = cpu_to_be32(start); + hip->first_extents[0].block_count = cpu_to_be32(len); res = 0; } else { /* try to append to extents in inode */ - res = hfsplus_add_extent(HFSPLUS_I(inode).first_extents, - HFSPLUS_I(inode).alloc_blocks, + res = hfsplus_add_extent(hip->first_extents, + hip->alloc_blocks, start, len); if (res == -ENOSPC) goto insert_extent; } if (!res) { - hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); - HFSPLUS_I(inode).first_blocks += len; + hfsplus_dump_extent(hip->first_extents); + hip->first_blocks += len; } } else { - res = hfsplus_add_extent(HFSPLUS_I(inode).cached_extents, - HFSPLUS_I(inode).alloc_blocks - - HFSPLUS_I(inode).cached_start, + res = hfsplus_add_extent(hip->cached_extents, + hip->alloc_blocks - hip->cached_start, start, len); if (!res) { - hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); - HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; - HFSPLUS_I(inode).cached_blocks += len; + hfsplus_dump_extent(hip->cached_extents); + hip->flags |= HFSPLUS_FLG_EXT_DIRTY; + hip->cached_blocks += len; } else if (res == -ENOSPC) goto insert_extent; } out: - mutex_unlock(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&hip->extents_lock); if (!res) { - HFSPLUS_I(inode).alloc_blocks += len; + hip->alloc_blocks += len; mark_inode_dirty(inode); } return res; insert_extent: dprint(DBG_EXTENT, "insert new extent\n"); - hfsplus_ext_write_extent(inode); + hfsplus_ext_write_extent_locked(inode); - memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); - HFSPLUS_I(inode).cached_extents[0].start_block = cpu_to_be32(start); - HFSPLUS_I(inode).cached_extents[0].block_count = cpu_to_be32(len); - hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); - HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW; - HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).alloc_blocks; - HFSPLUS_I(inode).cached_blocks = len; + memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); + hip->cached_extents[0].start_block = cpu_to_be32(start); + hip->cached_extents[0].block_count = cpu_to_be32(len); + hfsplus_dump_extent(hip->cached_extents); + hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW; + hip->cached_start = hip->alloc_blocks; + hip->cached_blocks = len; res = 0; goto out; @@ -437,13 +461,15 @@ insert_extent: void hfsplus_file_truncate(struct inode *inode) { struct super_block *sb = inode->i_sb; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); struct hfs_find_data fd; u32 alloc_cnt, blk_cnt, start; int res; - dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, - (long long)HFSPLUS_I(inode).phys_size, inode->i_size); - if (inode->i_size > HFSPLUS_I(inode).phys_size) { + dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", + inode->i_ino, (long long)hip->phys_size, inode->i_size); + + if (inode->i_size > hip->phys_size) { struct address_space *mapping = inode->i_mapping; struct page *page; void *fsdata; @@ -460,47 +486,48 @@ void hfsplus_file_truncate(struct inode *inode) return; mark_inode_dirty(inode); return; - } else if (inode->i_size == HFSPLUS_I(inode).phys_size) + } else if (inode->i_size == hip->phys_size) return; - blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift; - alloc_cnt = HFSPLUS_I(inode).alloc_blocks; + blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >> + HFSPLUS_SB(sb)->alloc_blksz_shift; + alloc_cnt = hip->alloc_blocks; if (blk_cnt == alloc_cnt) goto out; - mutex_lock(&HFSPLUS_I(inode).extents_lock); - hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); + mutex_lock(&hip->extents_lock); + hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); while (1) { - if (alloc_cnt == HFSPLUS_I(inode).first_blocks) { - hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents, + if (alloc_cnt == hip->first_blocks) { + hfsplus_free_extents(sb, hip->first_extents, alloc_cnt, alloc_cnt - blk_cnt); - hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); - HFSPLUS_I(inode).first_blocks = blk_cnt; + hfsplus_dump_extent(hip->first_extents); + hip->first_blocks = blk_cnt; break; } res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); if (res) break; - start = HFSPLUS_I(inode).cached_start; - hfsplus_free_extents(sb, HFSPLUS_I(inode).cached_extents, + start = hip->cached_start; + hfsplus_free_extents(sb, hip->cached_extents, alloc_cnt - start, alloc_cnt - blk_cnt); - hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); + hfsplus_dump_extent(hip->cached_extents); if (blk_cnt > start) { - HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; + hip->flags |= HFSPLUS_FLG_EXT_DIRTY; break; } alloc_cnt = start; - HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; - HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); + hip->cached_start = hip->cached_blocks = 0; + hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); hfs_brec_remove(&fd); } hfs_find_exit(&fd); - mutex_unlock(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&hip->extents_lock); - HFSPLUS_I(inode).alloc_blocks = blk_cnt; + hip->alloc_blocks = blk_cnt; out: - HFSPLUS_I(inode).phys_size = inode->i_size; - HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; - inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); + hip->phys_size = inode->i_size; + hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); mark_inode_dirty(inode); } diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index dc856be3c2b0..cb3653efb57a 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -62,7 +62,7 @@ struct hfs_btree { unsigned int depth; //unsigned int map1_size, map_size; - struct semaphore tree_lock; + struct mutex tree_lock; unsigned int pages_per_bnode; spinlock_t hash_lock; @@ -121,16 +121,21 @@ struct hfsplus_sb_info { u32 sect_count; int fs_shift; - /* Stuff in host order from Vol Header */ + /* immutable data from the volume header */ u32 alloc_blksz; int alloc_blksz_shift; u32 total_blocks; + u32 data_clump_blocks, rsrc_clump_blocks; + + /* mutable data from the volume header, protected by alloc_mutex */ u32 free_blocks; - u32 next_alloc; + struct mutex alloc_mutex; + + /* mutable data from the volume header, protected by vh_mutex */ u32 next_cnid; u32 file_count; u32 folder_count; - u32 data_clump_blocks, rsrc_clump_blocks; + struct mutex vh_mutex; /* Config options */ u32 creator; @@ -143,40 +148,50 @@ struct hfsplus_sb_info { int part, session; unsigned long flags; - - struct hlist_head rsrc_inodes; }; -#define HFSPLUS_SB_WRITEBACKUP 0x0001 -#define HFSPLUS_SB_NODECOMPOSE 0x0002 -#define HFSPLUS_SB_FORCE 0x0004 -#define HFSPLUS_SB_HFSX 0x0008 -#define HFSPLUS_SB_CASEFOLD 0x0010 +#define HFSPLUS_SB_WRITEBACKUP 0 +#define HFSPLUS_SB_NODECOMPOSE 1 +#define HFSPLUS_SB_FORCE 2 +#define HFSPLUS_SB_HFSX 3 +#define HFSPLUS_SB_CASEFOLD 4 struct hfsplus_inode_info { - struct mutex extents_lock; - u32 clump_blocks, alloc_blocks; - sector_t fs_blocks; - /* Allocation extents from catalog record or volume header */ - hfsplus_extent_rec first_extents; - u32 first_blocks; - hfsplus_extent_rec cached_extents; - u32 cached_start, cached_blocks; atomic_t opencnt; - struct inode *rsrc_inode; + /* + * Extent allocation information, protected by extents_lock. + */ + u32 first_blocks; + u32 clump_blocks; + u32 alloc_blocks; + u32 cached_start; + u32 cached_blocks; + hfsplus_extent_rec first_extents; + hfsplus_extent_rec cached_extents; unsigned long flags; + struct mutex extents_lock; + /* + * Immutable data. + */ + struct inode *rsrc_inode; __be32 create_date; - /* Device number in hfsplus_permissions in catalog */ - u32 dev; - /* BSD system and user file flags */ - u8 rootflags; - u8 userflags; + /* + * Protected by sbi->vh_mutex. + */ + u32 linkid; + + /* + * Protected by i_mutex. + */ + sector_t fs_blocks; + u8 userflags; /* BSD user file flags */ struct list_head open_dir_list; loff_t phys_size; + struct inode vfs_inode; }; @@ -184,8 +199,8 @@ struct hfsplus_inode_info { #define HFSPLUS_FLG_EXT_DIRTY 0x0002 #define HFSPLUS_FLG_EXT_NEW 0x0004 -#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC)) -#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC) +#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)) +#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC) struct hfs_find_data { /* filled by caller */ @@ -311,6 +326,7 @@ int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *); int hfsplus_delete_cat(u32, struct inode *, struct qstr *); int hfsplus_rename_cat(u32, struct inode *, struct qstr *, struct inode *, struct qstr *); +void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms); /* dir.c */ extern const struct inode_operations hfsplus_dir_inode_operations; @@ -372,26 +388,15 @@ int hfsplus_read_wrapper(struct super_block *); int hfs_part_find(struct super_block *, sector_t *, sector_t *); /* access macros */ -/* static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) { return sb->s_fs_info; } + static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) { return list_entry(inode, struct hfsplus_inode_info, vfs_inode); } -*/ -#define HFSPLUS_SB(super) (*(struct hfsplus_sb_info *)(super)->s_fs_info) -#define HFSPLUS_I(inode) (*list_entry(inode, struct hfsplus_inode_info, vfs_inode)) - -#if 1 -#define hfsplus_kmap(p) ({ struct page *__p = (p); kmap(__p); }) -#define hfsplus_kunmap(p) ({ struct page *__p = (p); kunmap(__p); __p; }) -#else -#define hfsplus_kmap(p) kmap(p) -#define hfsplus_kunmap(p) kunmap(p) -#endif #define sb_bread512(sb, sec, data) ({ \ struct buffer_head *__bh; \ @@ -419,6 +424,4 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) #define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) #define hfsp_now2mt() __hfsp_ut2mt(get_seconds()) -#define kdev_t_to_nr(x) (x) - #endif diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index fe99fe8db61a..6892899fd6fb 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -200,6 +200,7 @@ struct hfsplus_cat_key { struct hfsplus_unistr name; } __packed; +#define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key)) /* Structs from hfs.h */ struct hfsp_point { @@ -323,7 +324,7 @@ struct hfsplus_ext_key { __be32 start_block; } __packed; -#define HFSPLUS_EXT_KEYLEN 12 +#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key) /* HFS+ generic BTree key */ typedef union { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index c5a979d62c65..78449280dae0 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -36,7 +36,7 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping, *pagep = NULL; ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, hfsplus_get_block, - &HFSPLUS_I(mapping->host).phys_size); + &HFSPLUS_I(mapping->host)->phys_size); if (unlikely(ret)) { loff_t isize = mapping->host->i_size; if (pos + len > isize) @@ -62,13 +62,13 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask) switch (inode->i_ino) { case HFSPLUS_EXT_CNID: - tree = HFSPLUS_SB(sb).ext_tree; + tree = HFSPLUS_SB(sb)->ext_tree; break; case HFSPLUS_CAT_CNID: - tree = HFSPLUS_SB(sb).cat_tree; + tree = HFSPLUS_SB(sb)->cat_tree; break; case HFSPLUS_ATTR_CNID: - tree = HFSPLUS_SB(sb).attr_tree; + tree = HFSPLUS_SB(sb)->attr_tree; break; default: BUG(); @@ -172,12 +172,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent struct hfs_find_data fd; struct super_block *sb = dir->i_sb; struct inode *inode = NULL; + struct hfsplus_inode_info *hip; int err; if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) goto out; - inode = HFSPLUS_I(dir).rsrc_inode; + inode = HFSPLUS_I(dir)->rsrc_inode; if (inode) goto out; @@ -185,12 +186,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent if (!inode) return ERR_PTR(-ENOMEM); + hip = HFSPLUS_I(inode); inode->i_ino = dir->i_ino; - INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); - mutex_init(&HFSPLUS_I(inode).extents_lock); - HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC; + INIT_LIST_HEAD(&hip->open_dir_list); + mutex_init(&hip->extents_lock); + hip->flags = HFSPLUS_FLG_RSRC; - hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); + hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); err = hfsplus_find_cat(sb, dir->i_ino, &fd); if (!err) err = hfsplus_cat_read_inode(inode, &fd); @@ -199,10 +201,18 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent iput(inode); return ERR_PTR(err); } - HFSPLUS_I(inode).rsrc_inode = dir; - HFSPLUS_I(dir).rsrc_inode = inode; + hip->rsrc_inode = dir; + HFSPLUS_I(dir)->rsrc_inode = inode; igrab(dir); - hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes); + + /* + * __mark_inode_dirty expects inodes to be hashed. Since we don't + * want resource fork inodes in the regular inode space, we make them + * appear hashed, but do not put on any lists. hlist_del() + * will work fine and require no locking. + */ + inode->i_hash.pprev = &inode->i_hash.next; + mark_inode_dirty(inode); out: d_add(dentry, inode); @@ -211,30 +221,27 @@ out: static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) { - struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); u16 mode; mode = be16_to_cpu(perms->mode); inode->i_uid = be32_to_cpu(perms->owner); if (!inode->i_uid && !mode) - inode->i_uid = HFSPLUS_SB(sb).uid; + inode->i_uid = sbi->uid; inode->i_gid = be32_to_cpu(perms->group); if (!inode->i_gid && !mode) - inode->i_gid = HFSPLUS_SB(sb).gid; + inode->i_gid = sbi->gid; if (dir) { - mode = mode ? (mode & S_IALLUGO) : - (S_IRWXUGO & ~(HFSPLUS_SB(sb).umask)); + mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask)); mode |= S_IFDIR; } else if (!mode) - mode = S_IFREG | ((S_IRUGO|S_IWUGO) & - ~(HFSPLUS_SB(sb).umask)); + mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask)); inode->i_mode = mode; - HFSPLUS_I(inode).rootflags = perms->rootflags; - HFSPLUS_I(inode).userflags = perms->userflags; + HFSPLUS_I(inode)->userflags = perms->userflags; if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; else @@ -245,30 +252,13 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i inode->i_flags &= ~S_APPEND; } -static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) -{ - if (inode->i_flags & S_IMMUTABLE) - perms->rootflags |= HFSPLUS_FLG_IMMUTABLE; - else - perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE; - if (inode->i_flags & S_APPEND) - perms->rootflags |= HFSPLUS_FLG_APPEND; - else - perms->rootflags &= ~HFSPLUS_FLG_APPEND; - perms->userflags = HFSPLUS_I(inode).userflags; - perms->mode = cpu_to_be16(inode->i_mode); - perms->owner = cpu_to_be32(inode->i_uid); - perms->group = cpu_to_be32(inode->i_gid); - perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev); -} - static int hfsplus_file_open(struct inode *inode, struct file *file) { if (HFSPLUS_IS_RSRC(inode)) - inode = HFSPLUS_I(inode).rsrc_inode; + inode = HFSPLUS_I(inode)->rsrc_inode; if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EOVERFLOW; - atomic_inc(&HFSPLUS_I(inode).opencnt); + atomic_inc(&HFSPLUS_I(inode)->opencnt); return 0; } @@ -277,12 +267,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) struct super_block *sb = inode->i_sb; if (HFSPLUS_IS_RSRC(inode)) - inode = HFSPLUS_I(inode).rsrc_inode; - if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) { + inode = HFSPLUS_I(inode)->rsrc_inode; + if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) { mutex_lock(&inode->i_mutex); hfsplus_file_truncate(inode); if (inode->i_flags & S_DEAD) { - hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); + hfsplus_delete_cat(inode->i_ino, + HFSPLUS_SB(sb)->hidden_dir, NULL); hfsplus_delete_inode(inode); } mutex_unlock(&inode->i_mutex); @@ -361,47 +352,52 @@ static const struct file_operations hfsplus_file_operations = { struct inode *hfsplus_new_inode(struct super_block *sb, int mode) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct inode *inode = new_inode(sb); + struct hfsplus_inode_info *hip; + if (!inode) return NULL; - inode->i_ino = HFSPLUS_SB(sb).next_cnid++; + inode->i_ino = sbi->next_cnid++; inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_nlink = 1; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; - INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); - mutex_init(&HFSPLUS_I(inode).extents_lock); - atomic_set(&HFSPLUS_I(inode).opencnt, 0); - HFSPLUS_I(inode).flags = 0; - memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec)); - memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); - HFSPLUS_I(inode).alloc_blocks = 0; - HFSPLUS_I(inode).first_blocks = 0; - HFSPLUS_I(inode).cached_start = 0; - HFSPLUS_I(inode).cached_blocks = 0; - HFSPLUS_I(inode).phys_size = 0; - HFSPLUS_I(inode).fs_blocks = 0; - HFSPLUS_I(inode).rsrc_inode = NULL; + + hip = HFSPLUS_I(inode); + INIT_LIST_HEAD(&hip->open_dir_list); + mutex_init(&hip->extents_lock); + atomic_set(&hip->opencnt, 0); + hip->flags = 0; + memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec)); + memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); + hip->alloc_blocks = 0; + hip->first_blocks = 0; + hip->cached_start = 0; + hip->cached_blocks = 0; + hip->phys_size = 0; + hip->fs_blocks = 0; + hip->rsrc_inode = NULL; if (S_ISDIR(inode->i_mode)) { inode->i_size = 2; - HFSPLUS_SB(sb).folder_count++; + sbi->folder_count++; inode->i_op = &hfsplus_dir_inode_operations; inode->i_fop = &hfsplus_dir_operations; } else if (S_ISREG(inode->i_mode)) { - HFSPLUS_SB(sb).file_count++; + sbi->file_count++; inode->i_op = &hfsplus_file_inode_operations; inode->i_fop = &hfsplus_file_operations; inode->i_mapping->a_ops = &hfsplus_aops; - HFSPLUS_I(inode).clump_blocks = HFSPLUS_SB(sb).data_clump_blocks; + hip->clump_blocks = sbi->data_clump_blocks; } else if (S_ISLNK(inode->i_mode)) { - HFSPLUS_SB(sb).file_count++; + sbi->file_count++; inode->i_op = &page_symlink_inode_operations; inode->i_mapping->a_ops = &hfsplus_aops; - HFSPLUS_I(inode).clump_blocks = 1; + hip->clump_blocks = 1; } else - HFSPLUS_SB(sb).file_count++; + sbi->file_count++; insert_inode_hash(inode); mark_inode_dirty(inode); sb->s_dirt = 1; @@ -414,11 +410,11 @@ void hfsplus_delete_inode(struct inode *inode) struct super_block *sb = inode->i_sb; if (S_ISDIR(inode->i_mode)) { - HFSPLUS_SB(sb).folder_count--; + HFSPLUS_SB(sb)->folder_count--; sb->s_dirt = 1; return; } - HFSPLUS_SB(sb).file_count--; + HFSPLUS_SB(sb)->file_count--; if (S_ISREG(inode->i_mode)) { if (!inode->i_nlink) { inode->i_size = 0; @@ -434,34 +430,39 @@ void hfsplus_delete_inode(struct inode *inode) void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) { struct super_block *sb = inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); u32 count; int i; - memcpy(&HFSPLUS_I(inode).first_extents, &fork->extents, - sizeof(hfsplus_extent_rec)); + memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec)); for (count = 0, i = 0; i < 8; i++) count += be32_to_cpu(fork->extents[i].block_count); - HFSPLUS_I(inode).first_blocks = count; - memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); - HFSPLUS_I(inode).cached_start = 0; - HFSPLUS_I(inode).cached_blocks = 0; - - HFSPLUS_I(inode).alloc_blocks = be32_to_cpu(fork->total_blocks); - inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size); - HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; - inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); - HFSPLUS_I(inode).clump_blocks = be32_to_cpu(fork->clump_size) >> HFSPLUS_SB(sb).alloc_blksz_shift; - if (!HFSPLUS_I(inode).clump_blocks) - HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ? HFSPLUS_SB(sb).rsrc_clump_blocks : - HFSPLUS_SB(sb).data_clump_blocks; + hip->first_blocks = count; + memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); + hip->cached_start = 0; + hip->cached_blocks = 0; + + hip->alloc_blocks = be32_to_cpu(fork->total_blocks); + hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size); + hip->fs_blocks = + (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); + hip->clump_blocks = + be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift; + if (!hip->clump_blocks) { + hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ? + sbi->rsrc_clump_blocks : + sbi->data_clump_blocks; + } } void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork) { - memcpy(&fork->extents, &HFSPLUS_I(inode).first_extents, + memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents, sizeof(hfsplus_extent_rec)); fork->total_size = cpu_to_be64(inode->i_size); - fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode).alloc_blocks); + fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks); } int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) @@ -472,7 +473,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); - HFSPLUS_I(inode).dev = 0; + HFSPLUS_I(inode)->linkid = 0; if (type == HFSPLUS_FOLDER) { struct hfsplus_cat_folder *folder = &entry.folder; @@ -486,8 +487,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) inode->i_atime = hfsp_mt2ut(folder->access_date); inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); - HFSPLUS_I(inode).create_date = folder->create_date; - HFSPLUS_I(inode).fs_blocks = 0; + HFSPLUS_I(inode)->create_date = folder->create_date; + HFSPLUS_I(inode)->fs_blocks = 0; inode->i_op = &hfsplus_dir_inode_operations; inode->i_fop = &hfsplus_dir_operations; } else if (type == HFSPLUS_FILE) { @@ -518,7 +519,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) inode->i_atime = hfsp_mt2ut(file->access_date); inode->i_mtime = hfsp_mt2ut(file->content_mod_date); inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); - HFSPLUS_I(inode).create_date = file->create_date; + HFSPLUS_I(inode)->create_date = file->create_date; } else { printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); res = -EIO; @@ -533,12 +534,12 @@ int hfsplus_cat_write_inode(struct inode *inode) hfsplus_cat_entry entry; if (HFSPLUS_IS_RSRC(inode)) - main_inode = HFSPLUS_I(inode).rsrc_inode; + main_inode = HFSPLUS_I(inode)->rsrc_inode; if (!main_inode->i_nlink) return 0; - if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb).cat_tree, &fd)) + if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd)) /* panic? */ return -EIO; @@ -554,7 +555,7 @@ int hfsplus_cat_write_inode(struct inode *inode) hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_folder)); /* simple node checks? */ - hfsplus_set_perms(inode, &folder->permissions); + hfsplus_cat_set_perms(inode, &folder->permissions); folder->access_date = hfsp_ut2mt(inode->i_atime); folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); @@ -576,11 +577,7 @@ int hfsplus_cat_write_inode(struct inode *inode) hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); hfsplus_inode_write_fork(inode, &file->data_fork); - if (S_ISREG(inode->i_mode)) - HFSPLUS_I(inode).dev = inode->i_nlink; - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) - HFSPLUS_I(inode).dev = kdev_t_to_nr(inode->i_rdev); - hfsplus_set_perms(inode, &file->permissions); + hfsplus_cat_set_perms(inode, &file->permissions); if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); else diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index ac405f099026..5b4667e08ef7 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -17,83 +17,98 @@ #include <linux/mount.h> #include <linux/sched.h> #include <linux/xattr.h> -#include <linux/smp_lock.h> #include <asm/uaccess.h> #include "hfsplus_fs.h" -long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + unsigned int flags = 0; + + if (inode->i_flags & S_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; + if (inode->i_flags |= S_APPEND) + flags |= FS_APPEND_FL; + if (hip->userflags & HFSPLUS_FLG_NODUMP) + flags |= FS_NODUMP_FL; + + return put_user(flags, user_flags); +} + +static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); unsigned int flags; + int err = 0; - lock_kernel(); - switch (cmd) { - case HFSPLUS_IOC_EXT2_GETFLAGS: - flags = 0; - if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE) - flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */ - if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND) - flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */ - if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP) - flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */ - return put_user(flags, (int __user *)arg); - case HFSPLUS_IOC_EXT2_SETFLAGS: { - int err = 0; - err = mnt_want_write(filp->f_path.mnt); - if (err) { - unlock_kernel(); - return err; - } + err = mnt_want_write(file->f_path.mnt); + if (err) + goto out; - if (!is_owner_or_cap(inode)) { - err = -EACCES; - goto setflags_out; - } - if (get_user(flags, (int __user *)arg)) { - err = -EFAULT; - goto setflags_out; - } - if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) || - HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - err = -EPERM; - goto setflags_out; - } - } + if (!is_owner_or_cap(inode)) { + err = -EACCES; + goto out_drop_write; + } - /* don't silently ignore unsupported ext2 flags */ - if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { - err = -EOPNOTSUPP; - goto setflags_out; - } - if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */ - inode->i_flags |= S_IMMUTABLE; - HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE; - } else { - inode->i_flags &= ~S_IMMUTABLE; - HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE; - } - if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */ - inode->i_flags |= S_APPEND; - HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND; - } else { - inode->i_flags &= ~S_APPEND; - HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND; + if (get_user(flags, user_flags)) { + err = -EFAULT; + goto out_drop_write; + } + + mutex_lock(&inode->i_mutex); + + if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) || + inode->i_flags & (S_IMMUTABLE|S_APPEND)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto out_unlock_inode; } - if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */ - HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP; - else - HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP; - - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); -setflags_out: - mnt_drop_write(filp->f_path.mnt); - unlock_kernel(); - return err; } + + /* don't silently ignore unsupported ext2 flags */ + if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { + err = -EOPNOTSUPP; + goto out_unlock_inode; + } + + if (flags & FS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + + if (flags & FS_APPEND_FL) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + + if (flags & FS_NODUMP_FL) + hip->userflags |= HFSPLUS_FLG_NODUMP; + else + hip->userflags &= ~HFSPLUS_FLG_NODUMP; + + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + +out_unlock_inode: + mutex_lock(&inode->i_mutex); +out_drop_write: + mnt_drop_write(file->f_path.mnt); +out: + return err; +} + +long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + + switch (cmd) { + case HFSPLUS_IOC_EXT2_GETFLAGS: + return hfsplus_ioctl_getflags(file, argp); + case HFSPLUS_IOC_EXT2_SETFLAGS: + return hfsplus_ioctl_setflags(file, argp); default: - unlock_kernel(); return -ENOTTY; } } @@ -110,7 +125,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name, if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) return -EOPNOTSUPP; - res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); if (res) return res; res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); @@ -153,7 +168,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, return -EOPNOTSUPP; if (size) { - res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); if (res) return res; res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); @@ -177,7 +192,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, } else res = size ? -ERANGE : 4; } else - res = -ENODATA; + res = -EOPNOTSUPP; out: if (size) hfs_find_exit(&fd); diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 572628b4b07d..f9ab276a4d8d 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -143,13 +143,13 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) kfree(p); break; case opt_decompose: - sbi->flags &= ~HFSPLUS_SB_NODECOMPOSE; + clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); break; case opt_nodecompose: - sbi->flags |= HFSPLUS_SB_NODECOMPOSE; + set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); break; case opt_force: - sbi->flags |= HFSPLUS_SB_FORCE; + set_bit(HFSPLUS_SB_FORCE, &sbi->flags); break; default: return 0; @@ -171,7 +171,7 @@ done: int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) { - struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb); + struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb); if (sbi->creator != HFSPLUS_DEF_CR_TYPE) seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); @@ -184,7 +184,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) seq_printf(seq, ",session=%u", sbi->session); if (sbi->nls) seq_printf(seq, ",nls=%s", sbi->nls->charset); - if (sbi->flags & HFSPLUS_SB_NODECOMPOSE) + if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags)) seq_printf(seq, ",nodecompose"); return 0; } diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c index 1528a6fd0299..208b16c645cc 100644 --- a/fs/hfsplus/part_tbl.c +++ b/fs/hfsplus/part_tbl.c @@ -74,6 +74,7 @@ struct old_pmap { int hfs_part_find(struct super_block *sb, sector_t *part_start, sector_t *part_size) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct buffer_head *bh; __be16 *data; int i, size, res; @@ -95,7 +96,7 @@ int hfs_part_find(struct super_block *sb, for (i = 0; i < size; p++, i++) { if (p->pdStart && p->pdSize && p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && - (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) { + (sbi->part < 0 || sbi->part == i)) { *part_start += be32_to_cpu(p->pdStart); *part_size = be32_to_cpu(p->pdSize); res = 0; @@ -111,7 +112,7 @@ int hfs_part_find(struct super_block *sb, size = be32_to_cpu(pm->pmMapBlkCnt); for (i = 0; i < size;) { if (!memcmp(pm->pmPartType,"Apple_HFS", 9) && - (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) { + (sbi->part < 0 || sbi->part == i)) { *part_start += be32_to_cpu(pm->pmPyPartStart); *part_size = be32_to_cpu(pm->pmPartBlkCnt); res = 0; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 3b55c050c742..9a88d7536103 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -12,7 +12,6 @@ #include <linux/pagemap.h> #include <linux/fs.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/vfs.h> #include <linux/nls.h> @@ -21,40 +20,11 @@ static void hfsplus_destroy_inode(struct inode *inode); #include "hfsplus_fs.h" -struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) +static int hfsplus_system_read_inode(struct inode *inode) { - struct hfs_find_data fd; - struct hfsplus_vh *vhdr; - struct inode *inode; - long err = -EIO; - - inode = iget_locked(sb, ino); - if (!inode) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) - return inode; + struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr; - INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); - mutex_init(&HFSPLUS_I(inode).extents_lock); - HFSPLUS_I(inode).flags = 0; - HFSPLUS_I(inode).rsrc_inode = NULL; - atomic_set(&HFSPLUS_I(inode).opencnt, 0); - - if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) { - read_inode: - hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); - err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); - if (!err) - err = hfsplus_cat_read_inode(inode, &fd); - hfs_find_exit(&fd); - if (err) - goto bad_inode; - goto done; - } - vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr; - switch(inode->i_ino) { - case HFSPLUS_ROOT_CNID: - goto read_inode; + switch (inode->i_ino) { case HFSPLUS_EXT_CNID: hfsplus_inode_read_fork(inode, &vhdr->ext_file); inode->i_mapping->a_ops = &hfsplus_btree_aops; @@ -75,74 +45,101 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) inode->i_mapping->a_ops = &hfsplus_btree_aops; break; default: - goto bad_inode; + return -EIO; + } + + return 0; +} + +struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) +{ + struct hfs_find_data fd; + struct inode *inode; + int err; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); + mutex_init(&HFSPLUS_I(inode)->extents_lock); + HFSPLUS_I(inode)->flags = 0; + HFSPLUS_I(inode)->rsrc_inode = NULL; + atomic_set(&HFSPLUS_I(inode)->opencnt, 0); + + if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || + inode->i_ino == HFSPLUS_ROOT_CNID) { + hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (!err) + err = hfsplus_cat_read_inode(inode, &fd); + hfs_find_exit(&fd); + } else { + err = hfsplus_system_read_inode(inode); + } + + if (err) { + iget_failed(inode); + return ERR_PTR(err); } -done: unlock_new_inode(inode); return inode; - -bad_inode: - iget_failed(inode); - return ERR_PTR(err); } -static int hfsplus_write_inode(struct inode *inode, - struct writeback_control *wbc) +static int hfsplus_system_write_inode(struct inode *inode) { - struct hfsplus_vh *vhdr; - int ret = 0; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + struct hfsplus_vh *vhdr = sbi->s_vhdr; + struct hfsplus_fork_raw *fork; + struct hfs_btree *tree = NULL; - dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); - hfsplus_ext_write_extent(inode); - if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) { - return hfsplus_cat_write_inode(inode); - } - vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr; switch (inode->i_ino) { - case HFSPLUS_ROOT_CNID: - ret = hfsplus_cat_write_inode(inode); - break; case HFSPLUS_EXT_CNID: - if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) { - HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; - inode->i_sb->s_dirt = 1; - } - hfsplus_inode_write_fork(inode, &vhdr->ext_file); - hfs_btree_write(HFSPLUS_SB(inode->i_sb).ext_tree); + fork = &vhdr->ext_file; + tree = sbi->ext_tree; break; case HFSPLUS_CAT_CNID: - if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) { - HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; - inode->i_sb->s_dirt = 1; - } - hfsplus_inode_write_fork(inode, &vhdr->cat_file); - hfs_btree_write(HFSPLUS_SB(inode->i_sb).cat_tree); + fork = &vhdr->cat_file; + tree = sbi->cat_tree; break; case HFSPLUS_ALLOC_CNID: - if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) { - HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; - inode->i_sb->s_dirt = 1; - } - hfsplus_inode_write_fork(inode, &vhdr->alloc_file); + fork = &vhdr->alloc_file; break; case HFSPLUS_START_CNID: - if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) { - HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; - inode->i_sb->s_dirt = 1; - } - hfsplus_inode_write_fork(inode, &vhdr->start_file); + fork = &vhdr->start_file; break; case HFSPLUS_ATTR_CNID: - if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) { - HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; - inode->i_sb->s_dirt = 1; - } - hfsplus_inode_write_fork(inode, &vhdr->attr_file); - hfs_btree_write(HFSPLUS_SB(inode->i_sb).attr_tree); - break; + fork = &vhdr->attr_file; + tree = sbi->attr_tree; + default: + return -EIO; + } + + if (fork->total_size != cpu_to_be64(inode->i_size)) { + set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags); + inode->i_sb->s_dirt = 1; } - return ret; + hfsplus_inode_write_fork(inode, fork); + if (tree) + hfs_btree_write(tree); + return 0; +} + +static int hfsplus_write_inode(struct inode *inode, + struct writeback_control *wbc) +{ + dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); + + hfsplus_ext_write_extent(inode); + + if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || + inode->i_ino == HFSPLUS_ROOT_CNID) + return hfsplus_cat_write_inode(inode); + else + return hfsplus_system_write_inode(inode); } static void hfsplus_evict_inode(struct inode *inode) @@ -151,51 +148,53 @@ static void hfsplus_evict_inode(struct inode *inode) truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); if (HFSPLUS_IS_RSRC(inode)) { - HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL; - iput(HFSPLUS_I(inode).rsrc_inode); + HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; + iput(HFSPLUS_I(inode)->rsrc_inode); } } int hfsplus_sync_fs(struct super_block *sb, int wait) { - struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_vh *vhdr = sbi->s_vhdr; dprint(DBG_SUPER, "hfsplus_write_super\n"); - lock_super(sb); + mutex_lock(&sbi->vh_mutex); + mutex_lock(&sbi->alloc_mutex); sb->s_dirt = 0; - vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks); - vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc); - vhdr->next_cnid = cpu_to_be32(HFSPLUS_SB(sb).next_cnid); - vhdr->folder_count = cpu_to_be32(HFSPLUS_SB(sb).folder_count); - vhdr->file_count = cpu_to_be32(HFSPLUS_SB(sb).file_count); + vhdr->free_blocks = cpu_to_be32(sbi->free_blocks); + vhdr->next_cnid = cpu_to_be32(sbi->next_cnid); + vhdr->folder_count = cpu_to_be32(sbi->folder_count); + vhdr->file_count = cpu_to_be32(sbi->file_count); - mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); - if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_WRITEBACKUP) { - if (HFSPLUS_SB(sb).sect_count) { + mark_buffer_dirty(sbi->s_vhbh); + if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) { + if (sbi->sect_count) { struct buffer_head *bh; u32 block, offset; - block = HFSPLUS_SB(sb).blockoffset; - block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9); - offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1); - printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset, - HFSPLUS_SB(sb).sect_count, block, offset); + block = sbi->blockoffset; + block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9); + offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1); + printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", + sbi->blockoffset, sbi->sect_count, + block, offset); bh = sb_bread(sb, block); if (bh) { vhdr = (struct hfsplus_vh *)(bh->b_data + offset); if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) { - memcpy(vhdr, HFSPLUS_SB(sb).s_vhdr, sizeof(*vhdr)); + memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr)); mark_buffer_dirty(bh); brelse(bh); } else printk(KERN_WARNING "hfs: backup not found!\n"); } } - HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP; } - unlock_super(sb); + mutex_unlock(&sbi->alloc_mutex); + mutex_unlock(&sbi->vh_mutex); return 0; } @@ -209,48 +208,48 @@ static void hfsplus_write_super(struct super_block *sb) static void hfsplus_put_super(struct super_block *sb) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + dprint(DBG_SUPER, "hfsplus_put_super\n"); + if (!sb->s_fs_info) return; - lock_kernel(); - if (sb->s_dirt) hfsplus_write_super(sb); - if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) { - struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; + if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) { + struct hfsplus_vh *vhdr = sbi->s_vhdr; vhdr->modify_date = hfsp_now2mt(); vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT); vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT); - mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); - sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); + mark_buffer_dirty(sbi->s_vhbh); + sync_dirty_buffer(sbi->s_vhbh); } - hfs_btree_close(HFSPLUS_SB(sb).cat_tree); - hfs_btree_close(HFSPLUS_SB(sb).ext_tree); - iput(HFSPLUS_SB(sb).alloc_file); - iput(HFSPLUS_SB(sb).hidden_dir); - brelse(HFSPLUS_SB(sb).s_vhbh); - unload_nls(HFSPLUS_SB(sb).nls); + hfs_btree_close(sbi->cat_tree); + hfs_btree_close(sbi->ext_tree); + iput(sbi->alloc_file); + iput(sbi->hidden_dir); + brelse(sbi->s_vhbh); + unload_nls(sbi->nls); kfree(sb->s_fs_info); sb->s_fs_info = NULL; - - unlock_kernel(); } static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = HFSPLUS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift; - buf->f_bfree = HFSPLUS_SB(sb).free_blocks << HFSPLUS_SB(sb).fs_shift; + buf->f_blocks = sbi->total_blocks << sbi->fs_shift; + buf->f_bfree = sbi->free_blocks << sbi->fs_shift; buf->f_bavail = buf->f_bfree; buf->f_files = 0xFFFFFFFF; - buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid; + buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = HFSPLUS_MAX_STRLEN; @@ -263,11 +262,11 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; if (!(*flags & MS_RDONLY)) { - struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; + struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr; struct hfsplus_sb_info sbi; memset(&sbi, 0, sizeof(struct hfsplus_sb_info)); - sbi.nls = HFSPLUS_SB(sb).nls; + sbi.nls = HFSPLUS_SB(sb)->nls; if (!hfsplus_parse_options(data, &sbi)) return -EINVAL; @@ -276,7 +275,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) "running fsck.hfsplus is recommended. leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; - } else if (sbi.flags & HFSPLUS_SB_FORCE) { + } else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); @@ -320,7 +319,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; sb->s_fs_info = sbi; - INIT_HLIST_HEAD(&sbi->rsrc_inodes); + mutex_init(&sbi->alloc_mutex); + mutex_init(&sbi->vh_mutex); hfsplus_fill_defaults(sbi); if (!hfsplus_parse_options(data, sbi)) { printk(KERN_ERR "hfs: unable to parse mount options\n"); @@ -344,7 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; goto cleanup; } - vhdr = HFSPLUS_SB(sb).s_vhdr; + vhdr = sbi->s_vhdr; /* Copy parts of the volume header into the superblock */ sb->s_magic = HFSPLUS_VOLHEAD_SIG; @@ -353,18 +353,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) printk(KERN_ERR "hfs: wrong filesystem version\n"); goto cleanup; } - HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks); - HFSPLUS_SB(sb).free_blocks = be32_to_cpu(vhdr->free_blocks); - HFSPLUS_SB(sb).next_alloc = be32_to_cpu(vhdr->next_alloc); - HFSPLUS_SB(sb).next_cnid = be32_to_cpu(vhdr->next_cnid); - HFSPLUS_SB(sb).file_count = be32_to_cpu(vhdr->file_count); - HFSPLUS_SB(sb).folder_count = be32_to_cpu(vhdr->folder_count); - HFSPLUS_SB(sb).data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift; - if (!HFSPLUS_SB(sb).data_clump_blocks) - HFSPLUS_SB(sb).data_clump_blocks = 1; - HFSPLUS_SB(sb).rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift; - if (!HFSPLUS_SB(sb).rsrc_clump_blocks) - HFSPLUS_SB(sb).rsrc_clump_blocks = 1; + sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); + sbi->free_blocks = be32_to_cpu(vhdr->free_blocks); + sbi->next_cnid = be32_to_cpu(vhdr->next_cnid); + sbi->file_count = be32_to_cpu(vhdr->file_count); + sbi->folder_count = be32_to_cpu(vhdr->folder_count); + sbi->data_clump_blocks = + be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift; + if (!sbi->data_clump_blocks) + sbi->data_clump_blocks = 1; + sbi->rsrc_clump_blocks = + be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift; + if (!sbi->rsrc_clump_blocks) + sbi->rsrc_clump_blocks = 1; /* Set up operations so we can load metadata */ sb->s_op = &hfsplus_sops; @@ -374,7 +375,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, " "running fsck.hfsplus is recommended. mounting read-only.\n"); sb->s_flags |= MS_RDONLY; - } else if (sbi->flags & HFSPLUS_SB_FORCE) { + } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); @@ -384,16 +385,15 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) "use the force option at your own risk, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } - sbi->flags &= ~HFSPLUS_SB_FORCE; /* Load metadata objects (B*Trees) */ - HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); - if (!HFSPLUS_SB(sb).ext_tree) { + sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); + if (!sbi->ext_tree) { printk(KERN_ERR "hfs: failed to load extents file\n"); goto cleanup; } - HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); - if (!HFSPLUS_SB(sb).cat_tree) { + sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); + if (!sbi->cat_tree) { printk(KERN_ERR "hfs: failed to load catalog file\n"); goto cleanup; } @@ -404,7 +404,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) err = PTR_ERR(inode); goto cleanup; } - HFSPLUS_SB(sb).alloc_file = inode; + sbi->alloc_file = inode; /* Load the root directory */ root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); @@ -423,7 +423,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; str.name = HFSP_HIDDENDIR_NAME; - hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); + hfs_find_init(sbi->cat_tree, &fd); hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { hfs_find_exit(&fd); @@ -434,7 +434,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) err = PTR_ERR(inode); goto cleanup; } - HFSPLUS_SB(sb).hidden_dir = inode; + sbi->hidden_dir = inode; } else hfs_find_exit(&fd); @@ -449,15 +449,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) be32_add_cpu(&vhdr->write_count, 1); vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); - mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); - sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); + mark_buffer_dirty(sbi->s_vhbh); + sync_dirty_buffer(sbi->s_vhbh); - if (!HFSPLUS_SB(sb).hidden_dir) { + if (!sbi->hidden_dir) { printk(KERN_DEBUG "hfs: create hidden dir...\n"); - HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR); - hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode, - &str, HFSPLUS_SB(sb).hidden_dir); - mark_inode_dirty(HFSPLUS_SB(sb).hidden_dir); + + mutex_lock(&sbi->vh_mutex); + sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); + hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode, + &str, sbi->hidden_dir); + mutex_unlock(&sbi->vh_mutex); + + mark_inode_dirty(sbi->hidden_dir); } out: unload_nls(sbi->nls); @@ -486,7 +490,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb) static void hfsplus_destroy_inode(struct inode *inode) { - kmem_cache_free(hfsplus_inode_cachep, &HFSPLUS_I(inode)); + kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode)); } #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index 628ccf6fa402..b66d67de882c 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -121,7 +121,7 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc) int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p) { const hfsplus_unichr *ip; - struct nls_table *nls = HFSPLUS_SB(sb).nls; + struct nls_table *nls = HFSPLUS_SB(sb)->nls; u8 *op; u16 cc, c0, c1; u16 *ce1, *ce2; @@ -132,7 +132,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c ustrlen = be16_to_cpu(ustr->length); len = *len_p; ce1 = NULL; - compose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); + compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); while (ustrlen > 0) { c0 = be16_to_cpu(*ip++); @@ -246,7 +246,7 @@ out: static inline int asc2unichar(struct super_block *sb, const char *astr, int len, wchar_t *uc) { - int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc); + int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc); if (size <= 0) { *uc = '?'; size = 1; @@ -293,7 +293,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, u16 *dstr, outlen = 0; wchar_t c; - decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); + decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { size = asc2unichar(sb, astr, len, &c); @@ -330,8 +330,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str) wchar_t c; u16 c2; - casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); - decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); + casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); + decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); hash = init_name_hash(); astr = str->name; len = str->len; @@ -373,8 +373,8 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr * u16 c1, c2; wchar_t c; - casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); - decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); + casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); + decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); astr1 = s1->name; len1 = s1->len; astr2 = s2->name; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index bed78ac8f6d1..8972c20b3216 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -65,8 +65,8 @@ static int hfsplus_get_last_session(struct super_block *sb, *start = 0; *size = sb->s_bdev->bd_inode->i_size >> 9; - if (HFSPLUS_SB(sb).session >= 0) { - te.cdte_track = HFSPLUS_SB(sb).session; + if (HFSPLUS_SB(sb)->session >= 0) { + te.cdte_track = HFSPLUS_SB(sb)->session; te.cdte_format = CDROM_LBA; res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te); if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { @@ -87,6 +87,7 @@ static int hfsplus_get_last_session(struct super_block *sb, /* Takes in super block, returns true if good data read */ int hfsplus_read_wrapper(struct super_block *sb) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct buffer_head *bh; struct hfsplus_vh *vhdr; struct hfsplus_wd wd; @@ -122,7 +123,7 @@ int hfsplus_read_wrapper(struct super_block *sb) if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG)) break; if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) { - HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX; + set_bit(HFSPLUS_SB_HFSX, &sbi->flags); break; } brelse(bh); @@ -143,11 +144,11 @@ int hfsplus_read_wrapper(struct super_block *sb) if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize)) return -EINVAL; - HFSPLUS_SB(sb).alloc_blksz = blocksize; - HFSPLUS_SB(sb).alloc_blksz_shift = 0; + sbi->alloc_blksz = blocksize; + sbi->alloc_blksz_shift = 0; while ((blocksize >>= 1) != 0) - HFSPLUS_SB(sb).alloc_blksz_shift++; - blocksize = min(HFSPLUS_SB(sb).alloc_blksz, (u32)PAGE_SIZE); + sbi->alloc_blksz_shift++; + blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE); /* align block size to block offset */ while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1)) @@ -158,23 +159,26 @@ int hfsplus_read_wrapper(struct super_block *sb) return -EINVAL; } - HFSPLUS_SB(sb).blockoffset = part_start >> - (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT); - HFSPLUS_SB(sb).sect_count = part_size; - HFSPLUS_SB(sb).fs_shift = HFSPLUS_SB(sb).alloc_blksz_shift - - sb->s_blocksize_bits; + sbi->blockoffset = + part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT); + sbi->sect_count = part_size; + sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr); if (!bh) return -EIO; /* should still be the same... */ - if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ? - cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) : - cpu_to_be16(HFSPLUS_VOLHEAD_SIG))) - goto error; - HFSPLUS_SB(sb).s_vhbh = bh; - HFSPLUS_SB(sb).s_vhdr = vhdr; + if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) { + if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) + goto error; + } else { + if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG)) + goto error; + } + + sbi->s_vhbh = bh; + sbi->s_vhdr = vhdr; return 0; error: diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index cdfb8c6a4206..c16f8d8331b5 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -196,8 +196,6 @@ fh_lock(struct svc_fh *fhp) static inline void fh_unlock(struct svc_fh *fhp) { - BUG_ON(!fhp->fh_dentry); - if (fhp->fh_locked) { fill_post_wcc(fhp); mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 22c629eedd82..b388443c3a09 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -3,4 +3,4 @@ config FSNOTIFY source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" -source "fs/notify/fanotify/Kconfig" +#source "fs/notify/fanotify/Kconfig" diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index d59c4a65d492..81976ffed7d6 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -668,14 +668,11 @@ xfs_inode_set_reclaim_tag( xfs_perag_put(pag); } -void -__xfs_inode_clear_reclaim_tag( - xfs_mount_t *mp, +STATIC void +__xfs_inode_clear_reclaim( xfs_perag_t *pag, xfs_inode_t *ip) { - radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); pag->pag_ici_reclaimable--; if (!pag->pag_ici_reclaimable) { /* clear the reclaim tag from the perag radix tree */ @@ -689,6 +686,17 @@ __xfs_inode_clear_reclaim_tag( } } +void +__xfs_inode_clear_reclaim_tag( + xfs_mount_t *mp, + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + __xfs_inode_clear_reclaim(pag, ip); +} + /* * Inodes in different states need to be treated differently, and the return * value of xfs_iflush is not sufficient to get this right. The following table @@ -838,6 +846,7 @@ reclaim: if (!radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) ASSERT(0); + __xfs_inode_clear_reclaim(pag, ip); write_unlock(&pag->pag_ici_lock); /* diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h index 267a86c74e2e..2040e6c4f172 100644 --- a/include/drm/ttm/ttm_bo_api.h +++ b/include/drm/ttm/ttm_bo_api.h @@ -246,9 +246,11 @@ struct ttm_buffer_object { atomic_t reserved; - /** * Members protected by the bo::lock + * In addition, setting sync_obj to anything else + * than NULL requires bo::reserved to be held. This allows for + * checking NULL while reserved but not holding bo::lock. */ void *sync_obj_arg; diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 626b629429ff..4e8ea8c8ec1e 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -118,7 +118,6 @@ header-y += eventpoll.h header-y += ext2_fs.h header-y += fadvise.h header-y += falloc.h -header-y += fanotify.h header-y += fb.h header-y += fcntl.h header-y += fd.h diff --git a/fs/ceph/auth.h b/include/linux/ceph/auth.h index d38a2fb4a137..7fff521d7eb5 100644 --- a/fs/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -1,8 +1,8 @@ #ifndef _FS_CEPH_AUTH_H #define _FS_CEPH_AUTH_H -#include "types.h" -#include "buffer.h" +#include <linux/ceph/types.h> +#include <linux/ceph/buffer.h> /* * Abstract interface for communicating with the authenticate module. diff --git a/fs/ceph/buffer.h b/include/linux/ceph/buffer.h index 58d19014068f..58d19014068f 100644 --- a/fs/ceph/buffer.h +++ b/include/linux/ceph/buffer.h diff --git a/fs/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h index 1818c2305610..aa2e19182d99 100644 --- a/fs/ceph/ceph_debug.h +++ b/include/linux/ceph/ceph_debug.h @@ -3,7 +3,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#ifdef CONFIG_CEPH_FS_PRETTYDEBUG +#ifdef CONFIG_CEPH_LIB_PRETTYDEBUG /* * wrap pr_debug to include a filename:lineno prefix on each line. @@ -14,7 +14,8 @@ # if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) extern const char *ceph_file_part(const char *s, int len); # define dout(fmt, ...) \ - pr_debug(" %12.12s:%-4d : " fmt, \ + pr_debug("%.*s %12.12s:%-4d : " fmt, \ + 8 - (int)sizeof(KBUILD_MODNAME), " ", \ ceph_file_part(__FILE__, sizeof(__FILE__)), \ __LINE__, ##__VA_ARGS__) # else diff --git a/fs/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h index 5babb8e95352..5babb8e95352 100644 --- a/fs/ceph/ceph_frag.h +++ b/include/linux/ceph/ceph_frag.h diff --git a/fs/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index d5619ac86711..c3c74aef289d 100644 --- a/fs/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -299,6 +299,7 @@ enum { CEPH_MDS_OP_SETATTR = 0x01108, CEPH_MDS_OP_SETFILELOCK= 0x01109, CEPH_MDS_OP_GETFILELOCK= 0x00110, + CEPH_MDS_OP_SETDIRLAYOUT=0x0110a, CEPH_MDS_OP_MKNOD = 0x01201, CEPH_MDS_OP_LINK = 0x01202, diff --git a/fs/ceph/ceph_hash.h b/include/linux/ceph/ceph_hash.h index d099c3f90236..d099c3f90236 100644 --- a/fs/ceph/ceph_hash.h +++ b/include/linux/ceph/ceph_hash.h diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h new file mode 100644 index 000000000000..2a79702e092b --- /dev/null +++ b/include/linux/ceph/debugfs.h @@ -0,0 +1,33 @@ +#ifndef _FS_CEPH_DEBUGFS_H +#define _FS_CEPH_DEBUGFS_H + +#include "ceph_debug.h" +#include "types.h" + +#define CEPH_DEFINE_SHOW_FUNC(name) \ +static int name##_open(struct inode *inode, struct file *file) \ +{ \ + struct seq_file *sf; \ + int ret; \ + \ + ret = single_open(file, name, NULL); \ + sf = file->private_data; \ + sf->private = inode->i_private; \ + return ret; \ +} \ + \ +static const struct file_operations name##_fops = { \ + .open = name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +/* debugfs.c */ +extern int ceph_debugfs_init(void); +extern void ceph_debugfs_cleanup(void); +extern int ceph_debugfs_client_init(struct ceph_client *client); +extern void ceph_debugfs_client_cleanup(struct ceph_client *client); + +#endif + diff --git a/fs/ceph/decode.h b/include/linux/ceph/decode.h index 3d25415afe63..c5b6939fb32a 100644 --- a/fs/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -191,6 +191,11 @@ static inline void ceph_encode_string(void **p, void *end, ceph_encode_need(p, end, n, bad); \ ceph_encode_copy(p, pv, n); \ } while (0) +#define ceph_encode_string_safe(p, end, s, n, bad) \ + do { \ + ceph_encode_need(p, end, n, bad); \ + ceph_encode_string(p, end, s, n); \ + } while (0) #endif diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h new file mode 100644 index 000000000000..f22b2e941686 --- /dev/null +++ b/include/linux/ceph/libceph.h @@ -0,0 +1,249 @@ +#ifndef _FS_CEPH_LIBCEPH_H +#define _FS_CEPH_LIBCEPH_H + +#include "ceph_debug.h" + +#include <asm/unaligned.h> +#include <linux/backing-dev.h> +#include <linux/completion.h> +#include <linux/exportfs.h> +#include <linux/fs.h> +#include <linux/mempool.h> +#include <linux/pagemap.h> +#include <linux/wait.h> +#include <linux/writeback.h> +#include <linux/slab.h> + +#include "types.h" +#include "messenger.h" +#include "msgpool.h" +#include "mon_client.h" +#include "osd_client.h" +#include "ceph_fs.h" + +/* + * Supported features + */ +#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED_DEFAULT CEPH_FEATURE_NOSRCADDR + +/* + * mount options + */ +#define CEPH_OPT_FSID (1<<0) +#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ +#define CEPH_OPT_MYIP (1<<2) /* specified my ip */ +#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ + +#define CEPH_OPT_DEFAULT (0); + +#define ceph_set_opt(client, opt) \ + (client)->options->flags |= CEPH_OPT_##opt; +#define ceph_test_opt(client, opt) \ + (!!((client)->options->flags & CEPH_OPT_##opt)) + +struct ceph_options { + int flags; + struct ceph_fsid fsid; + struct ceph_entity_addr my_addr; + int mount_timeout; + int osd_idle_ttl; + int osd_timeout; + int osd_keepalive_timeout; + + /* + * any type that can't be simply compared or doesn't need need + * to be compared should go beyond this point, + * ceph_compare_options() should be updated accordingly + */ + + struct ceph_entity_addr *mon_addr; /* should be the first + pointer type of args */ + int num_mon; + char *name; + char *secret; +}; + +/* + * defaults + */ +#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 +#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ +#define CEPH_OSD_KEEPALIVE_DEFAULT 5 +#define CEPH_OSD_IDLE_TTL_DEFAULT 60 +#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ + +#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) +#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) + +#define CEPH_AUTH_NAME_DEFAULT "guest" + +/* + * Delay telling the MDS we no longer want caps, in case we reopen + * the file. Delay a minimum amount of time, even if we send a cap + * message for some other reason. Otherwise, take the oppotunity to + * update the mds to avoid sending another message later. + */ +#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ +#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ + +#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) + +/* mount state */ +enum { + CEPH_MOUNT_MOUNTING, + CEPH_MOUNT_MOUNTED, + CEPH_MOUNT_UNMOUNTING, + CEPH_MOUNT_UNMOUNTED, + CEPH_MOUNT_SHUTDOWN, +}; + +/* + * subtract jiffies + */ +static inline unsigned long time_sub(unsigned long a, unsigned long b) +{ + BUG_ON(time_after(b, a)); + return (long)a - (long)b; +} + +struct ceph_mds_client; + +/* + * per client state + * + * possibly shared by multiple mount points, if they are + * mounting the same ceph filesystem/cluster. + */ +struct ceph_client { + struct ceph_fsid fsid; + bool have_fsid; + + void *private; + + struct ceph_options *options; + + struct mutex mount_mutex; /* serialize mount attempts */ + wait_queue_head_t auth_wq; + int auth_err; + + int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *); + + u32 supported_features; + u32 required_features; + + struct ceph_messenger *msgr; /* messenger instance */ + struct ceph_mon_client monc; + struct ceph_osd_client osdc; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_dir; + struct dentry *debugfs_monmap; + struct dentry *debugfs_osdmap; +#endif +}; + + + +/* + * snapshots + */ + +/* + * A "snap context" is the set of existing snapshots when we + * write data. It is used by the OSD to guide its COW behavior. + * + * The ceph_snap_context is refcounted, and attached to each dirty + * page, indicating which context the dirty data belonged when it was + * dirtied. + */ +struct ceph_snap_context { + atomic_t nref; + u64 seq; + int num_snaps; + u64 snaps[]; +}; + +static inline struct ceph_snap_context * +ceph_get_snap_context(struct ceph_snap_context *sc) +{ + /* + printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), + atomic_read(&sc->nref)+1); + */ + if (sc) + atomic_inc(&sc->nref); + return sc; +} + +static inline void ceph_put_snap_context(struct ceph_snap_context *sc) +{ + if (!sc) + return; + /* + printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), + atomic_read(&sc->nref)-1); + */ + if (atomic_dec_and_test(&sc->nref)) { + /*printk(" deleting snap_context %p\n", sc);*/ + kfree(sc); + } +} + +/* + * calculate the number of pages a given length and offset map onto, + * if we align the data. + */ +static inline int calc_pages_for(u64 off, u64 len) +{ + return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - + (off >> PAGE_CACHE_SHIFT); +} + +/* ceph_common.c */ +extern const char *ceph_msg_type_name(int type); +extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); +extern struct kmem_cache *ceph_inode_cachep; +extern struct kmem_cache *ceph_cap_cachep; +extern struct kmem_cache *ceph_dentry_cachep; +extern struct kmem_cache *ceph_file_cachep; + +extern int ceph_parse_options(struct ceph_options **popt, char *options, + const char *dev_name, const char *dev_name_end, + int (*parse_extra_token)(char *c, void *private), + void *private); +extern void ceph_destroy_options(struct ceph_options *opt); +extern int ceph_compare_options(struct ceph_options *new_opt, + struct ceph_client *client); +extern struct ceph_client *ceph_create_client(struct ceph_options *opt, + void *private); +extern u64 ceph_client_id(struct ceph_client *client); +extern void ceph_destroy_client(struct ceph_client *client); +extern int __ceph_open_session(struct ceph_client *client, + unsigned long started); +extern int ceph_open_session(struct ceph_client *client); + +/* pagevec.c */ +extern void ceph_release_page_vector(struct page **pages, int num_pages); + +extern struct page **ceph_get_direct_page_vector(const char __user *data, + int num_pages, + loff_t off, size_t len); +extern void ceph_put_page_vector(struct page **pages, int num_pages); +extern void ceph_release_page_vector(struct page **pages, int num_pages); +extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); +extern int ceph_copy_user_to_page_vector(struct page **pages, + const char __user *data, + loff_t off, size_t len); +extern int ceph_copy_to_page_vector(struct page **pages, + const char *data, + loff_t off, size_t len); +extern int ceph_copy_from_page_vector(struct page **pages, + char *data, + loff_t off, size_t len); +extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data, + loff_t off, size_t len); +extern void ceph_zero_page_vector_range(int off, int len, struct page **pages); + + +#endif /* _FS_CEPH_SUPER_H */ diff --git a/fs/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 4c5cb0880bba..4c5cb0880bba 100644 --- a/fs/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h diff --git a/fs/ceph/messenger.h b/include/linux/ceph/messenger.h index 76fbc957bc13..5956d62c3057 100644 --- a/fs/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -65,6 +65,9 @@ struct ceph_messenger { */ u32 global_seq; spinlock_t global_seq_lock; + + u32 supported_features; + u32 required_features; }; /* @@ -82,6 +85,10 @@ struct ceph_msg { struct ceph_pagelist *pagelist; /* instead of pages */ struct list_head list_head; struct kref kref; + struct bio *bio; /* instead of pages/pagelist */ + struct bio *bio_iter; /* bio iterator */ + int bio_seg; /* current bio segment */ + struct ceph_pagelist *trail; /* the trailing part of the data */ bool front_is_vmalloc; bool more_to_follow; bool needs_out_seq; @@ -205,7 +212,7 @@ struct ceph_connection { }; -extern const char *pr_addr(const struct sockaddr_storage *ss); +extern const char *ceph_pr_addr(const struct sockaddr_storage *ss); extern int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, int max_count, int *count); @@ -216,7 +223,8 @@ extern void ceph_msgr_exit(void); extern void ceph_msgr_flush(void); extern struct ceph_messenger *ceph_messenger_create( - struct ceph_entity_addr *myaddr); + struct ceph_entity_addr *myaddr, + u32 features, u32 required); extern void ceph_messenger_destroy(struct ceph_messenger *); extern void ceph_con_init(struct ceph_messenger *msgr, diff --git a/fs/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 8e396f2c0963..545f85917780 100644 --- a/fs/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -79,6 +79,7 @@ struct ceph_mon_client { u64 last_tid; /* mds/osd map */ + int want_mdsmap; int want_next_osdmap; /* 1 = want, 2 = want+asked */ u32 have_osdmap, have_mdsmap; diff --git a/fs/ceph/msgpool.h b/include/linux/ceph/msgpool.h index a362605f9368..a362605f9368 100644 --- a/fs/ceph/msgpool.h +++ b/include/linux/ceph/msgpool.h diff --git a/fs/ceph/msgr.h b/include/linux/ceph/msgr.h index 680d3d648cac..680d3d648cac 100644 --- a/fs/ceph/msgr.h +++ b/include/linux/ceph/msgr.h diff --git a/fs/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ce776989ef6a..6c91fb032c39 100644 --- a/fs/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -15,6 +15,7 @@ struct ceph_snap_context; struct ceph_osd_request; struct ceph_osd_client; struct ceph_authorizer; +struct ceph_pagelist; /* * completion callback for async writepages @@ -68,6 +69,7 @@ struct ceph_osd_request { struct list_head r_unsafe_item; struct inode *r_inode; /* for use by callbacks */ + void *r_priv; /* ditto */ char r_oid[40]; /* object name */ int r_oid_len; @@ -80,6 +82,11 @@ struct ceph_osd_request { struct page **r_pages; /* pages for data payload */ int r_pages_from_pool; int r_own_pages; /* if true, i own page list */ +#ifdef CONFIG_BLOCK + struct bio *r_bio; /* instead of pages */ +#endif + + struct ceph_pagelist *r_trail; /* trailing part of the data */ }; struct ceph_osd_client { @@ -110,6 +117,42 @@ struct ceph_osd_client { struct ceph_msgpool msgpool_op_reply; }; +struct ceph_osd_req_op { + u16 op; /* CEPH_OSD_OP_* */ + u32 flags; /* CEPH_OSD_FLAG_* */ + union { + struct { + u64 offset, length; + u64 truncate_size; + u32 truncate_seq; + } extent; + struct { + const char *name; + u32 name_len; + const char *val; + u32 value_len; + __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ + __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ + } xattr; + struct { + const char *class_name; + __u8 class_len; + const char *method_name; + __u8 method_len; + __u8 argc; + const char *indata; + u32 indata_len; + } cls; + struct { + u64 cookie, count; + } pgls; + struct { + u64 snapid; + } snap; + }; + u32 payload_len; +}; + extern int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client); extern void ceph_osdc_stop(struct ceph_osd_client *osdc); @@ -119,6 +162,30 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); +extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc, + struct ceph_file_layout *layout, + u64 snapid, + u64 off, u64 *plen, u64 *bno, + struct ceph_osd_request *req, + struct ceph_osd_req_op *op); + +extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, + int flags, + struct ceph_snap_context *snapc, + struct ceph_osd_req_op *ops, + bool use_mempool, + gfp_t gfp_flags, + struct page **pages, + struct bio *bio); + +extern void ceph_osdc_build_request(struct ceph_osd_request *req, + u64 off, u64 *plen, + struct ceph_osd_req_op *src_ops, + struct ceph_snap_context *snapc, + struct timespec *mtime, + const char *oid, + int oid_len); + extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, struct ceph_vino vino, diff --git a/fs/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 970b547e510d..ba4c205cbb01 100644 --- a/fs/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -4,7 +4,7 @@ #include <linux/rbtree.h> #include "types.h" #include "ceph_fs.h" -#include "crush/crush.h" +#include <linux/crush/crush.h> /* * The osd map describes the current membership of the osd cluster and @@ -125,4 +125,6 @@ extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid); +extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); + #endif diff --git a/fs/ceph/pagelist.h b/include/linux/ceph/pagelist.h index e8a4187e1087..9660d6b0a35d 100644 --- a/fs/ceph/pagelist.h +++ b/include/linux/ceph/pagelist.h @@ -8,6 +8,14 @@ struct ceph_pagelist { void *mapped_tail; size_t length; size_t room; + struct list_head free_list; + size_t num_pages_free; +}; + +struct ceph_pagelist_cursor { + struct ceph_pagelist *pl; /* pagelist, for error checking */ + struct list_head *page_lru; /* page in list */ + size_t room; /* room remaining to reset to */ }; static inline void ceph_pagelist_init(struct ceph_pagelist *pl) @@ -16,10 +24,23 @@ static inline void ceph_pagelist_init(struct ceph_pagelist *pl) pl->mapped_tail = NULL; pl->length = 0; pl->room = 0; + INIT_LIST_HEAD(&pl->free_list); + pl->num_pages_free = 0; } + extern int ceph_pagelist_release(struct ceph_pagelist *pl); -extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l); +extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l); + +extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space); + +extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl); + +extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c); + +extern int ceph_pagelist_truncate(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c); static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v) { diff --git a/fs/ceph/rados.h b/include/linux/ceph/rados.h index 6d5247f2e81b..6d5247f2e81b 100644 --- a/fs/ceph/rados.h +++ b/include/linux/ceph/rados.h diff --git a/fs/ceph/types.h b/include/linux/ceph/types.h index 28b35a005ec2..28b35a005ec2 100644 --- a/fs/ceph/types.h +++ b/include/linux/ceph/types.h diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 0c991023ee47..709dfb901d11 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -75,7 +75,7 @@ struct cgroup_subsys_state { unsigned long flags; /* ID for this css, if possible */ - struct css_id *id; + struct css_id __rcu *id; }; /* bits in struct cgroup_subsys_state flags field */ @@ -205,7 +205,7 @@ struct cgroup { struct list_head children; /* my children */ struct cgroup *parent; /* my parent */ - struct dentry *dentry; /* cgroup fs entry, RCU protected */ + struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */ /* Private pointers for each registered subsystem */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c1a62c56a660..320d6c94ff84 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -16,7 +16,11 @@ # define __release(x) __context__(x,-1) # define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) # define __percpu __attribute__((noderef, address_space(3))) +#ifdef CONFIG_SPARSE_RCU_POINTER +# define __rcu __attribute__((noderef, address_space(4))) +#else # define __rcu +#endif extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); #else diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 8ba66a9d9022..ba4b85a6d9b8 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -9,37 +9,7 @@ * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. */ -static inline int dump_write(struct file *file, const void *addr, int nr) -{ - return file->f_op->write(file, addr, nr, &file->f_pos) == nr; -} - -static inline int dump_seek(struct file *file, loff_t off) -{ - int ret = 1; - - if (file->f_op->llseek && file->f_op->llseek != no_llseek) { - if (file->f_op->llseek(file, off, SEEK_CUR) < 0) - return 0; - } else { - char *buf = (char *)get_zeroed_page(GFP_KERNEL); - - if (!buf) - return 0; - while (off > 0) { - unsigned long n = off; - - if (n > PAGE_SIZE) - n = PAGE_SIZE; - if (!dump_write(file, buf, n)) { - ret = 0; - break; - } - off -= n; - } - free_page((unsigned long)buf); - } - return ret; -} +extern int dump_write(struct file *file, const void *addr, int nr); +extern int dump_seek(struct file *file, loff_t off); #endif /* _LINUX_COREDUMP_H */ diff --git a/include/linux/cred.h b/include/linux/cred.h index 4d2c39573f36..4aaeab376446 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -84,7 +84,7 @@ struct thread_group_cred { atomic_t usage; pid_t tgid; /* thread group process ID */ spinlock_t lock; - struct key *session_keyring; /* keyring inherited over fork */ + struct key __rcu *session_keyring; /* keyring inherited over fork */ struct key *process_keyring; /* keyring private to this process */ struct rcu_head rcu; /* RCU deletion hook */ }; diff --git a/fs/ceph/crush/crush.h b/include/linux/crush/crush.h index 97e435b191f4..97e435b191f4 100644 --- a/fs/ceph/crush/crush.h +++ b/include/linux/crush/crush.h diff --git a/fs/ceph/crush/hash.h b/include/linux/crush/hash.h index 91e884230d5d..91e884230d5d 100644 --- a/fs/ceph/crush/hash.h +++ b/include/linux/crush/hash.h diff --git a/fs/ceph/crush/mapper.h b/include/linux/crush/mapper.h index c46b99c18bb0..c46b99c18bb0 100644 --- a/fs/ceph/crush/mapper.h +++ b/include/linux/crush/mapper.h diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 29b3ce3f2a1d..2833452ea01c 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -49,7 +49,6 @@ struct task_struct; #ifdef CONFIG_LOCKDEP extern void debug_show_all_locks(void); -extern void __debug_show_held_locks(struct task_struct *task); extern void debug_show_held_locks(struct task_struct *task); extern void debug_check_no_locks_freed(const void *from, unsigned long len); extern void debug_check_no_locks_held(struct task_struct *task); @@ -58,10 +57,6 @@ static inline void debug_show_all_locks(void) { } -static inline void __debug_show_held_locks(struct task_struct *task) -{ -} - static inline void debug_show_held_locks(struct task_struct *task) { } diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 926b50322a46..4fd978e7eb83 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -93,6 +93,7 @@ struct elevator_queue struct elevator_type *elevator_type; struct mutex sysfs_lock; struct hlist_head *hash; + unsigned int registered:1; }; /* diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index f59ed297b661..133c0ba25e30 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -31,7 +31,7 @@ struct embedded_fd_set { struct fdtable { unsigned int max_fds; - struct file ** fd; /* current fd array */ + struct file __rcu **fd; /* current fd array */ fd_set *close_on_exec; fd_set *open_fds; struct rcu_head rcu; @@ -46,7 +46,7 @@ struct files_struct { * read mostly part */ atomic_t count; - struct fdtable *fdt; + struct fdtable __rcu *fdt; struct fdtable fdtab; /* * written part on a separate cache line in SMP @@ -55,7 +55,7 @@ struct files_struct { int next_fd; struct embedded_fd_set close_on_exec_init; struct embedded_fd_set open_fds_init; - struct file * fd_array[NR_OPEN_DEFAULT]; + struct file __rcu * fd_array[NR_OPEN_DEFAULT]; }; #define rcu_dereference_check_fdtable(files, fdtfd) \ diff --git a/include/linux/fs.h b/include/linux/fs.h index 63d069bd80b7..3168dcfb94f2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1384,7 +1384,7 @@ struct super_block { * Saved mount options for lazy filesystems using * generic_show_options() */ - char *s_options; + char __rcu *s_options; }; extern struct timespec current_fs_time(struct super_block *sb); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5f2f4c4d8fb0..af3f06b41dc1 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -129,8 +129,8 @@ struct blk_scsi_cmd_filter { struct disk_part_tbl { struct rcu_head rcu_head; int len; - struct hd_struct *last_lookup; - struct hd_struct *part[]; + struct hd_struct __rcu *last_lookup; + struct hd_struct __rcu *part[]; }; struct gendisk { @@ -149,7 +149,7 @@ struct gendisk { * non-critical accesses use RCU. Always access through * helpers. */ - struct disk_part_tbl *part_tbl; + struct disk_part_tbl __rcu *part_tbl; struct hd_struct part0; const struct block_device_operations *fops; diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index d5b387669dab..1f4517d55b19 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -139,7 +139,7 @@ static inline void account_system_vtime(struct task_struct *tsk) #endif #if defined(CONFIG_NO_HZ) -#if defined(CONFIG_TINY_RCU) +#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) extern void rcu_enter_nohz(void); extern void rcu_exit_nohz(void); diff --git a/include/linux/idr.h b/include/linux/idr.h index e968db71e33a..cdb715e58e3e 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -50,14 +50,14 @@ struct idr_layer { unsigned long bitmap; /* A zero bit means "space here" */ - struct idr_layer *ary[1<<IDR_BITS]; + struct idr_layer __rcu *ary[1<<IDR_BITS]; int count; /* When zero, we can release it */ int layer; /* distance from leaf */ struct rcu_head rcu_head; }; struct idr { - struct idr_layer *top; + struct idr_layer __rcu *top; struct idr_layer *id_free; int layers; /* only valid without concurrent changes */ int id_free_cnt; diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 1f43fa56f600..2fea6c8ef6ba 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -82,11 +82,17 @@ extern struct group_info init_groups; # define CAP_INIT_BSET CAP_FULL_SET #ifdef CONFIG_TREE_PREEMPT_RCU +#define INIT_TASK_RCU_TREE_PREEMPT() \ + .rcu_blocked_node = NULL, +#else +#define INIT_TASK_RCU_TREE_PREEMPT(tsk) +#endif +#ifdef CONFIG_PREEMPT_RCU #define INIT_TASK_RCU_PREEMPT(tsk) \ .rcu_read_lock_nesting = 0, \ .rcu_read_unlock_special = 0, \ - .rcu_blocked_node = NULL, \ - .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), + .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \ + INIT_TASK_RCU_TREE_PREEMPT() #else #define INIT_TASK_RCU_PREEMPT(tsk) #endif @@ -137,8 +143,8 @@ extern struct cred init_cred; .children = LIST_HEAD_INIT(tsk.children), \ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ - .real_cred = &init_cred, \ - .cred = &init_cred, \ + RCU_INIT_POINTER(.real_cred, &init_cred), \ + RCU_INIT_POINTER(.cred, &init_cred), \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(tsk.cred_guard_mutex), \ .comm = "swapper", \ diff --git a/include/linux/input.h b/include/linux/input.h index 896a92227bc4..d6ae1761be97 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -1196,7 +1196,7 @@ struct input_dev { int (*flush)(struct input_dev *dev, struct file *file); int (*event)(struct input_dev *dev, unsigned int type, unsigned int code, int value); - struct input_handle *grab; + struct input_handle __rcu *grab; spinlock_t event_lock; struct mutex mutex; diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 64d529133031..3e70b21884a9 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -53,7 +53,7 @@ struct io_context { struct radix_tree_root radix_root; struct hlist_head cic_list; - void *ioc_data; + void __rcu *ioc_data; }; static inline struct io_context *ioc_task_link(struct io_context *ioc) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2b0a35e6bc69..1759ba5adce8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -58,7 +58,18 @@ extern const char linux_proc_banner[]; #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) -#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +#define roundup(x, y) ( \ +{ \ + typeof(y) __y = y; \ + (((x) + (__y - 1)) / __y) * __y; \ +} \ +) +#define rounddown(x, y) ( \ +{ \ + typeof(x) __x = (x); \ + __x - (__x % (y)); \ +} \ +) #define DIV_ROUND_CLOSEST(x, divisor)( \ { \ typeof(divisor) __divisor = divisor; \ diff --git a/include/linux/key.h b/include/linux/key.h index cd50dfa1d4c2..3db0adce1fda 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -178,8 +178,9 @@ struct key { */ union { unsigned long value; + void __rcu *rcudata; void *data; - struct keyring_list *subscriptions; + struct keyring_list __rcu *subscriptions; } payload; }; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c13cc48697aa..ac740b26eb10 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -205,7 +205,7 @@ struct kvm { struct mutex irq_lock; #ifdef CONFIG_HAVE_KVM_IRQCHIP - struct kvm_irq_routing_table *irq_routing; + struct kvm_irq_routing_table __rcu *irq_routing; struct hlist_head mask_notifier_list; struct hlist_head irq_ack_notifier_list; #endif diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 06aed8305bf3..2186a64ee4b5 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -32,6 +32,17 @@ extern int lock_stat; #define MAX_LOCKDEP_SUBCLASSES 8UL /* + * NR_LOCKDEP_CACHING_CLASSES ... Number of classes + * cached in the instance of lockdep_map + * + * Currently main class (subclass == 0) and signle depth subclass + * are cached in lockdep_map. This optimization is mainly targeting + * on rq->lock. double_rq_lock() acquires this highly competitive with + * single depth. + */ +#define NR_LOCKDEP_CACHING_CLASSES 2 + +/* * Lock-classes are keyed via unique addresses, by embedding the * lockclass-key into the kernel (or module) .data section. (For * static locks we use the lock address itself as the key.) @@ -138,7 +149,7 @@ void clear_lock_stats(struct lock_class *class); */ struct lockdep_map { struct lock_class_key *key; - struct lock_class *class_cache; + struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES]; const char *name; #ifdef CONFIG_LOCK_STAT int cpu; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ee7e258627f9..cb57d657ce4d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -299,7 +299,7 @@ struct mm_struct { * new_owner->mm == mm * new_owner->alloc_lock is held */ - struct task_struct *owner; + struct task_struct __rcu *owner; #endif #ifdef CONFIG_PROC_FS diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index 9ed534c991b9..70cd0603911c 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -39,8 +39,9 @@ enum ctattr_type { CTA_TUPLE_MASTER, CTA_NAT_SEQ_ADJ_ORIG, CTA_NAT_SEQ_ADJ_REPLY, - CTA_SECMARK, + CTA_SECMARK, /* obsolete */ CTA_ZONE, + CTA_SECCTX, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) @@ -172,4 +173,11 @@ enum ctattr_help { }; #define CTA_HELP_MAX (__CTA_HELP_MAX - 1) +enum ctattr_secctx { + CTA_SECCTX_UNSPEC, + CTA_SECCTX_NAME, + __CTA_SECCTX_MAX +}; +#define CTA_SECCTX_MAX (__CTA_SECCTX_MAX - 1) + #endif /* _IPCONNTRACK_NETLINK_H */ diff --git a/include/linux/netfilter/xt_SECMARK.h b/include/linux/netfilter/xt_SECMARK.h index 6fcd3448b186..989092bd6274 100644 --- a/include/linux/netfilter/xt_SECMARK.h +++ b/include/linux/netfilter/xt_SECMARK.h @@ -11,18 +11,12 @@ * packets are being marked for. */ #define SECMARK_MODE_SEL 0x01 /* SELinux */ -#define SECMARK_SELCTX_MAX 256 - -struct xt_secmark_target_selinux_info { - __u32 selsid; - char selctx[SECMARK_SELCTX_MAX]; -}; +#define SECMARK_SECCTX_MAX 256 struct xt_secmark_target_info { __u8 mode; - union { - struct xt_secmark_target_selinux_info sel; - } u; + __u32 secid; + char secctx[SECMARK_SECCTX_MAX]; }; #endif /*_XT_SECMARK_H_target */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 508f8cf6da37..d0edf7d823ae 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -185,7 +185,7 @@ struct nfs_inode { struct nfs4_cached_acl *nfs4_acl; /* NFSv4 state */ struct list_head open_states; - struct nfs_delegation *delegation; + struct nfs_delegation __rcu *delegation; fmode_t delegation_state; struct rw_semaphore rwsem; #endif /* CONFIG_NFS_V4*/ diff --git a/include/linux/notifier.h b/include/linux/notifier.h index b2f1a4d83550..2026f9e1ceb8 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -49,28 +49,28 @@ struct notifier_block { int (*notifier_call)(struct notifier_block *, unsigned long, void *); - struct notifier_block *next; + struct notifier_block __rcu *next; int priority; }; struct atomic_notifier_head { spinlock_t lock; - struct notifier_block *head; + struct notifier_block __rcu *head; }; struct blocking_notifier_head { struct rw_semaphore rwsem; - struct notifier_block *head; + struct notifier_block __rcu *head; }; struct raw_notifier_head { - struct notifier_block *head; + struct notifier_block __rcu *head; }; struct srcu_notifier_head { struct mutex mutex; struct srcu_struct srcu; - struct notifier_block *head; + struct notifier_block __rcu *head; }; #define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \ diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 634b8e674ac5..a39cbed9ee17 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -47,6 +47,8 @@ static inline void *radix_tree_indirect_to_ptr(void *ptr) { return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); } +#define radix_tree_indirect_to_ptr(ptr) \ + radix_tree_indirect_to_ptr((void __force *)(ptr)) static inline int radix_tree_is_indirect_ptr(void *ptr) { @@ -61,7 +63,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) struct radix_tree_root { unsigned int height; gfp_t gfp_mask; - struct radix_tree_node *rnode; + struct radix_tree_node __rcu *rnode; }; #define RADIX_TREE_INIT(mask) { \ diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4ec3b38ce9c5..f31ef61f1c65 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -10,6 +10,21 @@ #include <linux/rcupdate.h> /* + * Why is there no list_empty_rcu()? Because list_empty() serves this + * purpose. The list_empty() function fetches the RCU-protected pointer + * and compares it to the address of the list head, but neither dereferences + * this pointer itself nor provides this pointer to the caller. Therefore, + * it is not necessary to use rcu_dereference(), so that list_empty() can + * be used anywhere you would want to use a list_empty_rcu(). + */ + +/* + * return the ->next pointer of a list_head in an rcu safe + * way, we must not access it directly + */ +#define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) + +/* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know @@ -20,7 +35,7 @@ static inline void __list_add_rcu(struct list_head *new, { new->next = next; new->prev = prev; - rcu_assign_pointer(prev->next, new); + rcu_assign_pointer(list_next_rcu(prev), new); next->prev = new; } @@ -138,7 +153,7 @@ static inline void list_replace_rcu(struct list_head *old, { new->next = old->next; new->prev = old->prev; - rcu_assign_pointer(new->prev->next, new); + rcu_assign_pointer(list_next_rcu(new->prev), new); new->next->prev = new; old->prev = LIST_POISON2; } @@ -193,7 +208,7 @@ static inline void list_splice_init_rcu(struct list_head *list, */ last->next = at; - rcu_assign_pointer(head->next, first); + rcu_assign_pointer(list_next_rcu(head), first); first->prev = head; at->prev = last; } @@ -208,7 +223,9 @@ static inline void list_splice_init_rcu(struct list_head *list, * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ - container_of(rcu_dereference_raw(ptr), type, member) + ({typeof (*ptr) __rcu *__ptr = (typeof (*ptr) __rcu __force *)ptr; \ + container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member); \ + }) /** * list_first_entry_rcu - get the first element from a list @@ -225,9 +242,9 @@ static inline void list_splice_init_rcu(struct list_head *list, list_entry_rcu((ptr)->next, type, member) #define __list_for_each_rcu(pos, head) \ - for (pos = rcu_dereference_raw((head)->next); \ + for (pos = rcu_dereference_raw(list_next_rcu(head)); \ pos != (head); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(list_next_rcu((pos))) /** * list_for_each_entry_rcu - iterate over rcu list of given type @@ -257,9 +274,9 @@ static inline void list_splice_init_rcu(struct list_head *list, * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_continue_rcu(pos, head) \ - for ((pos) = rcu_dereference_raw((pos)->next); \ + for ((pos) = rcu_dereference_raw(list_next_rcu(pos)); \ prefetch((pos)->next), (pos) != (head); \ - (pos) = rcu_dereference_raw((pos)->next)) + (pos) = rcu_dereference_raw(list_next_rcu(pos))) /** * list_for_each_entry_continue_rcu - continue iteration over list of given type @@ -314,12 +331,19 @@ static inline void hlist_replace_rcu(struct hlist_node *old, new->next = next; new->pprev = old->pprev; - rcu_assign_pointer(*new->pprev, new); + rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new); if (next) new->next->pprev = &new->next; old->pprev = LIST_POISON2; } +/* + * return the first or the next element in an RCU protected hlist + */ +#define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first))) +#define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next))) +#define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev))) + /** * hlist_add_head_rcu * @n: the element to add to the hash list. @@ -346,7 +370,7 @@ static inline void hlist_add_head_rcu(struct hlist_node *n, n->next = first; n->pprev = &h->first; - rcu_assign_pointer(h->first, n); + rcu_assign_pointer(hlist_first_rcu(h), n); if (first) first->pprev = &n->next; } @@ -374,7 +398,7 @@ static inline void hlist_add_before_rcu(struct hlist_node *n, { n->pprev = next->pprev; n->next = next; - rcu_assign_pointer(*(n->pprev), n); + rcu_assign_pointer(hlist_pprev_rcu(n), n); next->pprev = &n->next; } @@ -401,15 +425,15 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, { n->next = prev->next; n->pprev = &prev->next; - rcu_assign_pointer(prev->next, n); + rcu_assign_pointer(hlist_next_rcu(prev), n); if (n->next) n->next->pprev = &n->next; } -#define __hlist_for_each_rcu(pos, head) \ - for (pos = rcu_dereference((head)->first); \ - pos && ({ prefetch(pos->next); 1; }); \ - pos = rcu_dereference(pos->next)) +#define __hlist_for_each_rcu(pos, head) \ + for (pos = rcu_dereference(hlist_first_rcu(head)); \ + pos && ({ prefetch(pos->next); 1; }); \ + pos = rcu_dereference(hlist_next_rcu(pos))) /** * hlist_for_each_entry_rcu - iterate over rcu list of given type @@ -422,11 +446,11 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ -#define hlist_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = rcu_dereference_raw((head)->first); \ +#define hlist_for_each_entry_rcu(tpos, pos, head, member) \ + for (pos = rcu_dereference_raw(hlist_first_rcu(head)); \ pos && ({ prefetch(pos->next); 1; }) && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(hlist_next_rcu(pos))) /** * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index b70ffe53cb9f..2ae13714828b 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -37,6 +37,12 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) } } +#define hlist_nulls_first_rcu(head) \ + (*((struct hlist_nulls_node __rcu __force **)&(head)->first)) + +#define hlist_nulls_next_rcu(node) \ + (*((struct hlist_nulls_node __rcu __force **)&(node)->next)) + /** * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. @@ -88,7 +94,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, n->next = first; n->pprev = &h->first; - rcu_assign_pointer(h->first, n); + rcu_assign_pointer(hlist_nulls_first_rcu(h), n); if (!is_a_nulls(first)) first->pprev = &n->next; } @@ -100,11 +106,11 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, * @member: the name of the hlist_nulls_node within the struct. * */ -#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = rcu_dereference_raw((head)->first); \ - (!is_a_nulls(pos)) && \ +#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ + for (pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ + (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos))) #endif #endif diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 83af1f8d8b74..03cda7bed985 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -41,11 +41,15 @@ #include <linux/lockdep.h> #include <linux/completion.h> #include <linux/debugobjects.h> +#include <linux/compiler.h> #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; /* for sysctl */ #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) +#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) + /** * struct rcu_head - callback structure for use with RCU * @next: next update requests in a list @@ -57,29 +61,94 @@ struct rcu_head { }; /* Exported common interfaces */ -extern void rcu_barrier(void); +extern void call_rcu_sched(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)); +extern void synchronize_sched(void); extern void rcu_barrier_bh(void); extern void rcu_barrier_sched(void); extern void synchronize_sched_expedited(void); extern int sched_expedited_torture_stats(char *page); +static inline void __rcu_read_lock_bh(void) +{ + local_bh_disable(); +} + +static inline void __rcu_read_unlock_bh(void) +{ + local_bh_enable(); +} + +#ifdef CONFIG_PREEMPT_RCU + +extern void __rcu_read_lock(void); +extern void __rcu_read_unlock(void); +void synchronize_rcu(void); + +/* + * Defined as a macro as it is a very low level header included from + * areas that don't even know about current. This gives the rcu_read_lock() + * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other + * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. + */ +#define rcu_preempt_depth() (current->rcu_read_lock_nesting) + +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +static inline void __rcu_read_lock(void) +{ + preempt_disable(); +} + +static inline void __rcu_read_unlock(void) +{ + preempt_enable(); +} + +static inline void synchronize_rcu(void) +{ + synchronize_sched(); +} + +static inline int rcu_preempt_depth(void) +{ + return 0; +} + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + /* Internal to kernel */ extern void rcu_init(void); +extern void rcu_sched_qs(int cpu); +extern void rcu_bh_qs(int cpu); +extern void rcu_check_callbacks(int cpu, int user); +struct notifier_block; + +#ifdef CONFIG_NO_HZ + +extern void rcu_enter_nohz(void); +extern void rcu_exit_nohz(void); + +#else /* #ifdef CONFIG_NO_HZ */ + +static inline void rcu_enter_nohz(void) +{ +} + +static inline void rcu_exit_nohz(void) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ */ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) #include <linux/rcutree.h> -#elif defined(CONFIG_TINY_RCU) +#elif defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) #include <linux/rcutiny.h> #else #error "Unknown RCU implementation specified to kernel configuration" #endif -#define RCU_HEAD_INIT { .next = NULL, .func = NULL } -#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT -#define INIT_RCU_HEAD(ptr) do { \ - (ptr)->next = NULL; (ptr)->func = NULL; \ -} while (0) - /* * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic * initialization and destruction of rcu_head on the stack. rcu_head structures @@ -120,14 +189,15 @@ extern struct lockdep_map rcu_sched_lock_map; extern int debug_lockdep_rcu_enabled(void); /** - * rcu_read_lock_held - might we be in RCU read-side critical section? + * rcu_read_lock_held() - might we be in RCU read-side critical section? * * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, * this assumes we are in an RCU read-side critical section unless it can - * prove otherwise. + * prove otherwise. This is useful for debug checks in functions that + * require that they be called within an RCU read-side critical section. * - * Check debug_lockdep_rcu_enabled() to prevent false positives during boot + * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. */ static inline int rcu_read_lock_held(void) @@ -144,14 +214,16 @@ static inline int rcu_read_lock_held(void) extern int rcu_read_lock_bh_held(void); /** - * rcu_read_lock_sched_held - might we be in RCU-sched read-side critical section? + * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? * * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an * RCU-sched read-side critical section. In absence of * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. Note that disabling * of preemption (including disabling irqs) counts as an RCU-sched - * read-side critical section. + * read-side critical section. This is useful for debug checks in functions + * that required that they be called within an RCU-sched read-side + * critical section. * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. @@ -211,7 +283,11 @@ static inline int rcu_read_lock_sched_held(void) extern int rcu_my_thread_group_empty(void); -#define __do_rcu_dereference_check(c) \ +/** + * rcu_lockdep_assert - emit lockdep splat if specified condition not met + * @c: condition to check + */ +#define rcu_lockdep_assert(c) \ do { \ static bool __warned; \ if (debug_lockdep_rcu_enabled() && !__warned && !(c)) { \ @@ -220,41 +296,163 @@ extern int rcu_my_thread_group_empty(void); } \ } while (0) +#else /* #ifdef CONFIG_PROVE_RCU */ + +#define rcu_lockdep_assert(c) do { } while (0) + +#endif /* #else #ifdef CONFIG_PROVE_RCU */ + +/* + * Helper functions for rcu_dereference_check(), rcu_dereference_protected() + * and rcu_assign_pointer(). Some of these could be folded into their + * callers, but they are left separate in order to ease introduction of + * multiple flavors of pointers to match the multiple flavors of RCU + * (e.g., __rcu_bh, * __rcu_sched, and __srcu), should this make sense in + * the future. + */ + +#ifdef __CHECKER__ +#define rcu_dereference_sparse(p, space) \ + ((void)(((typeof(*p) space *)p) == p)) +#else /* #ifdef __CHECKER__ */ +#define rcu_dereference_sparse(p, space) +#endif /* #else #ifdef __CHECKER__ */ + +#define __rcu_access_pointer(p, space) \ + ({ \ + typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \ + rcu_dereference_sparse(p, space); \ + ((typeof(*p) __force __kernel *)(_________p1)); \ + }) +#define __rcu_dereference_check(p, c, space) \ + ({ \ + typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \ + rcu_lockdep_assert(c); \ + rcu_dereference_sparse(p, space); \ + smp_read_barrier_depends(); \ + ((typeof(*p) __force __kernel *)(_________p1)); \ + }) +#define __rcu_dereference_protected(p, c, space) \ + ({ \ + rcu_lockdep_assert(c); \ + rcu_dereference_sparse(p, space); \ + ((typeof(*p) __force __kernel *)(p)); \ + }) + +#define __rcu_dereference_index_check(p, c) \ + ({ \ + typeof(p) _________p1 = ACCESS_ONCE(p); \ + rcu_lockdep_assert(c); \ + smp_read_barrier_depends(); \ + (_________p1); \ + }) +#define __rcu_assign_pointer(p, v, space) \ + ({ \ + if (!__builtin_constant_p(v) || \ + ((v) != NULL)) \ + smp_wmb(); \ + (p) = (typeof(*v) __force space *)(v); \ + }) + + +/** + * rcu_access_pointer() - fetch RCU pointer with no dereferencing + * @p: The pointer to read + * + * Return the value of the specified RCU-protected pointer, but omit the + * smp_read_barrier_depends() and keep the ACCESS_ONCE(). This is useful + * when the value of this pointer is accessed, but the pointer is not + * dereferenced, for example, when testing an RCU-protected pointer against + * NULL. Although rcu_access_pointer() may also be used in cases where + * update-side locks prevent the value of the pointer from changing, you + * should instead use rcu_dereference_protected() for this use case. + */ +#define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu) + /** - * rcu_dereference_check - rcu_dereference with debug checking + * rcu_dereference_check() - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Do an rcu_dereference(), but check that the conditions under which the - * dereference will take place are correct. Typically the conditions indicate - * the various locking conditions that should be held at that point. The check - * should return true if the conditions are satisfied. + * dereference will take place are correct. Typically the conditions + * indicate the various locking conditions that should be held at that + * point. The check should return true if the conditions are satisfied. + * An implicit check for being in an RCU read-side critical section + * (rcu_read_lock()) is included. * * For example: * - * bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() || - * lockdep_is_held(&foo->lock)); + * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock)); * * could be used to indicate to lockdep that foo->bar may only be dereferenced - * if either the RCU read lock is held, or that the lock required to replace + * if either rcu_read_lock() is held, or that the lock required to replace * the bar struct at foo->bar is held. * * Note that the list of conditions may also include indications of when a lock * need not be held, for example during initialisation or destruction of the * target struct: * - * bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() || - * lockdep_is_held(&foo->lock) || + * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) || * atomic_read(&foo->usage) == 0); + * + * Inserts memory barriers on architectures that require them + * (currently only the Alpha), prevents the compiler from refetching + * (and from merging fetches), and, more importantly, documents exactly + * which pointers are protected by RCU and checks that the pointer is + * annotated as __rcu. */ #define rcu_dereference_check(p, c) \ - ({ \ - __do_rcu_dereference_check(c); \ - rcu_dereference_raw(p); \ - }) + __rcu_dereference_check((p), rcu_read_lock_held() || (c), __rcu) + +/** + * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * This is the RCU-bh counterpart to rcu_dereference_check(). + */ +#define rcu_dereference_bh_check(p, c) \ + __rcu_dereference_check((p), rcu_read_lock_bh_held() || (c), __rcu) /** - * rcu_dereference_protected - fetch RCU pointer when updates prevented + * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * This is the RCU-sched counterpart to rcu_dereference_check(). + */ +#define rcu_dereference_sched_check(p, c) \ + __rcu_dereference_check((p), rcu_read_lock_sched_held() || (c), \ + __rcu) + +#define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/ + +/** + * rcu_dereference_index_check() - rcu_dereference for indices with debug checking + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * Similar to rcu_dereference_check(), but omits the sparse checking. + * This allows rcu_dereference_index_check() to be used on integers, + * which can then be used as array indices. Attempting to use + * rcu_dereference_check() on an integer will give compiler warnings + * because the sparse address-space mechanism relies on dereferencing + * the RCU-protected pointer. Dereferencing integers is not something + * that even gcc will put up with. + * + * Note that this function does not implicitly check for RCU read-side + * critical sections. If this function gains lots of uses, it might + * make sense to provide versions for each flavor of RCU, but it does + * not make sense as of early 2010. + */ +#define rcu_dereference_index_check(p, c) \ + __rcu_dereference_index_check((p), (c)) + +/** + * rcu_dereference_protected() - fetch RCU pointer when updates prevented + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place * * Return the value of the specified RCU-protected pointer, but omit * both the smp_read_barrier_depends() and the ACCESS_ONCE(). This @@ -263,35 +461,61 @@ extern int rcu_my_thread_group_empty(void); * prevent the compiler from repeating this reference or combining it * with other references, so it should not be used without protection * of appropriate locks. + * + * This function is only for update-side use. Using this function + * when protected only by rcu_read_lock() will result in infrequent + * but very ugly failures. */ #define rcu_dereference_protected(p, c) \ - ({ \ - __do_rcu_dereference_check(c); \ - (p); \ - }) + __rcu_dereference_protected((p), (c), __rcu) -#else /* #ifdef CONFIG_PROVE_RCU */ +/** + * rcu_dereference_bh_protected() - fetch RCU-bh pointer when updates prevented + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * This is the RCU-bh counterpart to rcu_dereference_protected(). + */ +#define rcu_dereference_bh_protected(p, c) \ + __rcu_dereference_protected((p), (c), __rcu) -#define rcu_dereference_check(p, c) rcu_dereference_raw(p) -#define rcu_dereference_protected(p, c) (p) +/** + * rcu_dereference_sched_protected() - fetch RCU-sched pointer when updates prevented + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * This is the RCU-sched counterpart to rcu_dereference_protected(). + */ +#define rcu_dereference_sched_protected(p, c) \ + __rcu_dereference_protected((p), (c), __rcu) -#endif /* #else #ifdef CONFIG_PROVE_RCU */ /** - * rcu_access_pointer - fetch RCU pointer with no dereferencing + * rcu_dereference() - fetch RCU-protected pointer for dereferencing + * @p: The pointer to read, prior to dereferencing * - * Return the value of the specified RCU-protected pointer, but omit the - * smp_read_barrier_depends() and keep the ACCESS_ONCE(). This is useful - * when the value of this pointer is accessed, but the pointer is not - * dereferenced, for example, when testing an RCU-protected pointer against - * NULL. This may also be used in cases where update-side locks prevent - * the value of the pointer from changing, but rcu_dereference_protected() - * is a lighter-weight primitive for this use case. + * This is a simple wrapper around rcu_dereference_check(). + */ +#define rcu_dereference(p) rcu_dereference_check(p, 0) + +/** + * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing + * @p: The pointer to read, prior to dereferencing + * + * Makes rcu_dereference_check() do the dirty work. + */ +#define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0) + +/** + * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing + * @p: The pointer to read, prior to dereferencing + * + * Makes rcu_dereference_check() do the dirty work. */ -#define rcu_access_pointer(p) ACCESS_ONCE(p) +#define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0) /** - * rcu_read_lock - mark the beginning of an RCU read-side critical section. + * rcu_read_lock() - mark the beginning of an RCU read-side critical section * * When synchronize_rcu() is invoked on one CPU while other CPUs * are within RCU read-side critical sections, then the @@ -302,7 +526,7 @@ extern int rcu_my_thread_group_empty(void); * until after the all the other CPUs exit their critical sections. * * Note, however, that RCU callbacks are permitted to run concurrently - * with RCU read-side critical sections. One way that this can happen + * with new RCU read-side critical sections. One way that this can happen * is via the following sequence of events: (1) CPU 0 enters an RCU * read-side critical section, (2) CPU 1 invokes call_rcu() to register * an RCU callback, (3) CPU 0 exits the RCU read-side critical section, @@ -317,7 +541,20 @@ extern int rcu_my_thread_group_empty(void); * will be deferred until the outermost RCU read-side critical section * completes. * - * It is illegal to block while in an RCU read-side critical section. + * You can avoid reading and understanding the next paragraph by + * following this rule: don't put anything in an rcu_read_lock() RCU + * read-side critical section that would block in a !PREEMPT kernel. + * But if you want the full story, read on! + * + * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), it + * is illegal to block while in an RCU read-side critical section. In + * preemptible RCU implementations (TREE_PREEMPT_RCU and TINY_PREEMPT_RCU) + * in CONFIG_PREEMPT kernel builds, RCU read-side critical sections may + * be preempted, but explicit blocking is illegal. Finally, in preemptible + * RCU implementations in real-time (CONFIG_PREEMPT_RT) kernel builds, + * RCU read-side critical sections may be preempted and they may also + * block, but only when acquiring spinlocks that are subject to priority + * inheritance. */ static inline void rcu_read_lock(void) { @@ -337,7 +574,7 @@ static inline void rcu_read_lock(void) */ /** - * rcu_read_unlock - marks the end of an RCU read-side critical section. + * rcu_read_unlock() - marks the end of an RCU read-side critical section. * * See rcu_read_lock() for more information. */ @@ -349,15 +586,16 @@ static inline void rcu_read_unlock(void) } /** - * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section + * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section * * This is equivalent of rcu_read_lock(), but to be used when updates - * are being done using call_rcu_bh(). Since call_rcu_bh() callbacks - * consider completion of a softirq handler to be a quiescent state, - * a process in RCU read-side critical section must be protected by - * disabling softirqs. Read-side critical sections in interrupt context - * can use just rcu_read_lock(). - * + * are being done using call_rcu_bh() or synchronize_rcu_bh(). Since + * both call_rcu_bh() and synchronize_rcu_bh() consider completion of a + * softirq handler to be a quiescent state, a process in RCU read-side + * critical section must be protected by disabling softirqs. Read-side + * critical sections in interrupt context can use just rcu_read_lock(), + * though this should at least be commented to avoid confusing people + * reading the code. */ static inline void rcu_read_lock_bh(void) { @@ -379,13 +617,12 @@ static inline void rcu_read_unlock_bh(void) } /** - * rcu_read_lock_sched - mark the beginning of a RCU-classic critical section + * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section * - * Should be used with either - * - synchronize_sched() - * or - * - call_rcu_sched() and rcu_barrier_sched() - * on the write-side to insure proper synchronization. + * This is equivalent of rcu_read_lock(), but to be used when updates + * are being done using call_rcu_sched() or synchronize_rcu_sched(). + * Read-side critical sections can also be introduced by anything that + * disables preemption, including local_irq_disable() and friends. */ static inline void rcu_read_lock_sched(void) { @@ -420,54 +657,14 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) preempt_enable_notrace(); } - /** - * rcu_dereference_raw - fetch an RCU-protected pointer + * rcu_assign_pointer() - assign to RCU-protected pointer + * @p: pointer to assign to + * @v: value to assign (publish) * - * The caller must be within some flavor of RCU read-side critical - * section, or must be otherwise preventing the pointer from changing, - * for example, by holding an appropriate lock. This pointer may later - * be safely dereferenced. It is the caller's responsibility to have - * done the right thing, as this primitive does no checking of any kind. - * - * Inserts memory barriers on architectures that require them - * (currently only the Alpha), and, more importantly, documents - * exactly which pointers are protected by RCU. - */ -#define rcu_dereference_raw(p) ({ \ - typeof(p) _________p1 = ACCESS_ONCE(p); \ - smp_read_barrier_depends(); \ - (_________p1); \ - }) - -/** - * rcu_dereference - fetch an RCU-protected pointer, checking for RCU - * - * Makes rcu_dereference_check() do the dirty work. - */ -#define rcu_dereference(p) \ - rcu_dereference_check(p, rcu_read_lock_held()) - -/** - * rcu_dereference_bh - fetch an RCU-protected pointer, checking for RCU-bh - * - * Makes rcu_dereference_check() do the dirty work. - */ -#define rcu_dereference_bh(p) \ - rcu_dereference_check(p, rcu_read_lock_bh_held() || irqs_disabled()) - -/** - * rcu_dereference_sched - fetch RCU-protected pointer, checking for RCU-sched - * - * Makes rcu_dereference_check() do the dirty work. - */ -#define rcu_dereference_sched(p) \ - rcu_dereference_check(p, rcu_read_lock_sched_held()) - -/** - * rcu_assign_pointer - assign (publicize) a pointer to a newly - * initialized structure that will be dereferenced by RCU read-side - * critical sections. Returns the value assigned. + * Assigns the specified value to the specified RCU-protected + * pointer, ensuring that any concurrent RCU readers will see + * any prior initialization. Returns the value assigned. * * Inserts memory barriers on architectures that require them * (pretty much all of them other than x86), and also prevents @@ -476,14 +673,17 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * call documents which pointers will be dereferenced by RCU read-side * code. */ - #define rcu_assign_pointer(p, v) \ - ({ \ - if (!__builtin_constant_p(v) || \ - ((v) != NULL)) \ - smp_wmb(); \ - (p) = (v); \ - }) + __rcu_assign_pointer((p), (v), __rcu) + +/** + * RCU_INIT_POINTER() - initialize an RCU protected pointer + * + * Initialize an RCU-protected pointer in such a way to avoid RCU-lockdep + * splats. + */ +#define RCU_INIT_POINTER(p, v) \ + p = (typeof(*v) __force __rcu *)(v) /* Infrastructure to implement the synchronize_() primitives. */ @@ -494,26 +694,37 @@ struct rcu_synchronize { extern void wakeme_after_rcu(struct rcu_head *head); +#ifdef CONFIG_PREEMPT_RCU + /** - * call_rcu - Queue an RCU callback for invocation after a grace period. + * call_rcu() - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period + * @func: actual callback function to be invoked after the grace period * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. However, the callback function + * might well execute concurrently with RCU read-side critical sections + * that started after call_rcu() was invoked. RCU read-side critical * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ extern void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *head)); +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +/* In classic RCU, call_rcu() is just call_rcu_sched(). */ +#define call_rcu call_rcu_sched + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + /** - * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. + * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period + * @func: actual callback function to be invoked after the grace period * - * The update function will be invoked some time after a full grace + * The callback function will be invoked some time after a full grace * period elapses, in other words after all currently executing RCU * read-side critical sections have completed. call_rcu_bh() assumes * that the read-side critical sections end on completion of a softirq @@ -566,37 +777,4 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -#ifndef CONFIG_PROVE_RCU -#define __do_rcu_dereference_check(c) do { } while (0) -#endif /* #ifdef CONFIG_PROVE_RCU */ - -#define __rcu_dereference_index_check(p, c) \ - ({ \ - typeof(p) _________p1 = ACCESS_ONCE(p); \ - __do_rcu_dereference_check(c); \ - smp_read_barrier_depends(); \ - (_________p1); \ - }) - -/** - * rcu_dereference_index_check() - rcu_dereference for indices with debug checking - * @p: The pointer to read, prior to dereferencing - * @c: The conditions under which the dereference will take place - * - * Similar to rcu_dereference_check(), but omits the sparse checking. - * This allows rcu_dereference_index_check() to be used on integers, - * which can then be used as array indices. Attempting to use - * rcu_dereference_check() on an integer will give compiler warnings - * because the sparse address-space mechanism relies on dereferencing - * the RCU-protected pointer. Dereferencing integers is not something - * that even gcc will put up with. - * - * Note that this function does not implicitly check for RCU read-side - * critical sections. If this function gains lots of uses, it might - * make sense to provide versions for each flavor of RCU, but it does - * not make sense as of early 2010. - */ -#define rcu_dereference_index_check(p, c) \ - __rcu_dereference_index_check((p), (c)) - #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index e2e893144a84..13877cb93a60 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -27,103 +27,101 @@ #include <linux/cache.h> -void rcu_sched_qs(int cpu); -void rcu_bh_qs(int cpu); -static inline void rcu_note_context_switch(int cpu) -{ - rcu_sched_qs(cpu); -} +#define rcu_init_sched() do { } while (0) -#define __rcu_read_lock() preempt_disable() -#define __rcu_read_unlock() preempt_enable() -#define __rcu_read_lock_bh() local_bh_disable() -#define __rcu_read_unlock_bh() local_bh_enable() -#define call_rcu_sched call_rcu +#ifdef CONFIG_TINY_RCU -#define rcu_init_sched() do { } while (0) -extern void rcu_check_callbacks(int cpu, int user); +static inline void synchronize_rcu_expedited(void) +{ + synchronize_sched(); /* Only one CPU, so pretty fast anyway!!! */ +} -static inline int rcu_needs_cpu(int cpu) +static inline void rcu_barrier(void) { - return 0; + rcu_barrier_sched(); /* Only one CPU, so only one list of callbacks! */ } -/* - * Return the number of grace periods. - */ -static inline long rcu_batches_completed(void) +#else /* #ifdef CONFIG_TINY_RCU */ + +void rcu_barrier(void); +void synchronize_rcu_expedited(void); + +#endif /* #else #ifdef CONFIG_TINY_RCU */ + +static inline void synchronize_rcu_bh(void) { - return 0; + synchronize_sched(); } -/* - * Return the number of bottom-half grace periods. - */ -static inline long rcu_batches_completed_bh(void) +static inline void synchronize_rcu_bh_expedited(void) { - return 0; + synchronize_sched(); } -static inline void rcu_force_quiescent_state(void) +#ifdef CONFIG_TINY_RCU + +static inline void rcu_preempt_note_context_switch(void) { } -static inline void rcu_bh_force_quiescent_state(void) +static inline void exit_rcu(void) { } -static inline void rcu_sched_force_quiescent_state(void) +static inline int rcu_needs_cpu(int cpu) { + return 0; } -extern void synchronize_sched(void); +#else /* #ifdef CONFIG_TINY_RCU */ + +void rcu_preempt_note_context_switch(void); +extern void exit_rcu(void); +int rcu_preempt_needs_cpu(void); -static inline void synchronize_rcu(void) +static inline int rcu_needs_cpu(int cpu) { - synchronize_sched(); + return rcu_preempt_needs_cpu(); } -static inline void synchronize_rcu_bh(void) +#endif /* #else #ifdef CONFIG_TINY_RCU */ + +static inline void rcu_note_context_switch(int cpu) { - synchronize_sched(); + rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(); } -static inline void synchronize_rcu_expedited(void) +/* + * Return the number of grace periods. + */ +static inline long rcu_batches_completed(void) { - synchronize_sched(); + return 0; } -static inline void synchronize_rcu_bh_expedited(void) +/* + * Return the number of bottom-half grace periods. + */ +static inline long rcu_batches_completed_bh(void) { - synchronize_sched(); + return 0; } -struct notifier_block; - -#ifdef CONFIG_NO_HZ - -extern void rcu_enter_nohz(void); -extern void rcu_exit_nohz(void); - -#else /* #ifdef CONFIG_NO_HZ */ - -static inline void rcu_enter_nohz(void) +static inline void rcu_force_quiescent_state(void) { } -static inline void rcu_exit_nohz(void) +static inline void rcu_bh_force_quiescent_state(void) { } -#endif /* #else #ifdef CONFIG_NO_HZ */ - -static inline void exit_rcu(void) +static inline void rcu_sched_force_quiescent_state(void) { } -static inline int rcu_preempt_depth(void) +static inline void rcu_cpu_stall_reset(void) { - return 0; } #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index c0ed1c056f29..95518e628794 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -30,64 +30,23 @@ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H -struct notifier_block; - -extern void rcu_sched_qs(int cpu); -extern void rcu_bh_qs(int cpu); extern void rcu_note_context_switch(int cpu); extern int rcu_needs_cpu(int cpu); +extern void rcu_cpu_stall_reset(void); #ifdef CONFIG_TREE_PREEMPT_RCU -extern void __rcu_read_lock(void); -extern void __rcu_read_unlock(void); -extern void synchronize_rcu(void); extern void exit_rcu(void); -/* - * Defined as macro as it is a very low level header - * included from areas that don't even know about current - */ -#define rcu_preempt_depth() (current->rcu_read_lock_nesting) - #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -static inline void __rcu_read_lock(void) -{ - preempt_disable(); -} - -static inline void __rcu_read_unlock(void) -{ - preempt_enable(); -} - -#define synchronize_rcu synchronize_sched - static inline void exit_rcu(void) { } -static inline int rcu_preempt_depth(void) -{ - return 0; -} - #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ -static inline void __rcu_read_lock_bh(void) -{ - local_bh_disable(); -} -static inline void __rcu_read_unlock_bh(void) -{ - local_bh_enable(); -} - -extern void call_rcu_sched(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)); extern void synchronize_rcu_bh(void); -extern void synchronize_sched(void); extern void synchronize_rcu_expedited(void); static inline void synchronize_rcu_bh_expedited(void) @@ -95,7 +54,7 @@ static inline void synchronize_rcu_bh_expedited(void) synchronize_sched_expedited(); } -extern void rcu_check_callbacks(int cpu, int user); +extern void rcu_barrier(void); extern long rcu_batches_completed(void); extern long rcu_batches_completed_bh(void); @@ -104,18 +63,6 @@ extern void rcu_force_quiescent_state(void); extern void rcu_bh_force_quiescent_state(void); extern void rcu_sched_force_quiescent_state(void); -#ifdef CONFIG_NO_HZ -void rcu_enter_nohz(void); -void rcu_exit_nohz(void); -#else /* CONFIG_NO_HZ */ -static inline void rcu_enter_nohz(void) -{ -} -static inline void rcu_exit_nohz(void) -{ -} -#endif /* CONFIG_NO_HZ */ - /* A context switch is a grace period for RCU-sched and RCU-bh. */ static inline int rcu_blocking_is_gp(void) { diff --git a/include/linux/sched.h b/include/linux/sched.h index eb3c1ceec06e..61b4ecf1da50 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1209,11 +1209,13 @@ struct task_struct { unsigned int policy; cpumask_t cpus_allowed; -#ifdef CONFIG_TREE_PREEMPT_RCU +#ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; char rcu_read_unlock_special; - struct rcu_node *rcu_blocked_node; struct list_head rcu_node_entry; +#endif /* #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_TREE_PREEMPT_RCU + struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) @@ -1295,9 +1297,9 @@ struct task_struct { struct list_head cpu_timers[3]; /* process credentials */ - const struct cred *real_cred; /* objective and real subjective task + const struct cred __rcu *real_cred; /* objective and real subjective task * credentials (COW) */ - const struct cred *cred; /* effective (overridable) subjective task + const struct cred __rcu *cred; /* effective (overridable) subjective task * credentials (COW) */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations @@ -1425,7 +1427,7 @@ struct task_struct { #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock */ - struct css_set *cgroups; + struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock */ struct list_head cg_list; #endif @@ -1747,7 +1749,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) -#ifdef CONFIG_TREE_PREEMPT_RCU +#ifdef CONFIG_PREEMPT_RCU #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ @@ -1756,7 +1758,9 @@ static inline void rcu_copy_process(struct task_struct *p) { p->rcu_read_lock_nesting = 0; p->rcu_read_unlock_special = 0; +#ifdef CONFIG_TREE_PREEMPT_RCU p->rcu_blocked_node = NULL; +#endif INIT_LIST_HEAD(&p->rcu_node_entry); } diff --git a/include/linux/security.h b/include/linux/security.h index a22219afff09..b8246a8df7d2 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -74,7 +74,7 @@ extern int cap_file_mmap(struct file *file, unsigned long reqprot, extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags); extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); -extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp); +extern int cap_task_setscheduler(struct task_struct *p); extern int cap_task_setioprio(struct task_struct *p, int ioprio); extern int cap_task_setnice(struct task_struct *p, int nice); extern int cap_syslog(int type, bool from_file); @@ -959,6 +959,12 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Sets the new child socket's sid to the openreq sid. * @inet_conn_established: * Sets the connection's peersid to the secmark on skb. + * @secmark_relabel_packet: + * check if the process should be allowed to relabel packets to the given secid + * @security_secmark_refcount_inc + * tells the LSM to increment the number of secmark labeling rules loaded + * @security_secmark_refcount_dec + * tells the LSM to decrement the number of secmark labeling rules loaded * @req_classify_flow: * Sets the flow's sid to the openreq sid. * @tun_dev_create: @@ -1279,9 +1285,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Return 0 if permission is granted. * * @secid_to_secctx: - * Convert secid to security context. + * Convert secid to security context. If secdata is NULL the length of + * the result will be returned in seclen, but no secdata will be returned. + * This does mean that the length could change between calls to check the + * length and the next call which actually allocates and returns the secdata. * @secid contains the security ID. * @secdata contains the pointer that stores the converted security context. + * @seclen pointer which contains the length of the data * @secctx_to_secid: * Convert security context to secid. * @secid contains the pointer to the generated security ID. @@ -1501,8 +1511,7 @@ struct security_operations { int (*task_getioprio) (struct task_struct *p); int (*task_setrlimit) (struct task_struct *p, unsigned int resource, struct rlimit *new_rlim); - int (*task_setscheduler) (struct task_struct *p, int policy, - struct sched_param *lp); + int (*task_setscheduler) (struct task_struct *p); int (*task_getscheduler) (struct task_struct *p); int (*task_movememory) (struct task_struct *p); int (*task_kill) (struct task_struct *p, @@ -1594,6 +1603,9 @@ struct security_operations { struct request_sock *req); void (*inet_csk_clone) (struct sock *newsk, const struct request_sock *req); void (*inet_conn_established) (struct sock *sk, struct sk_buff *skb); + int (*secmark_relabel_packet) (u32 secid); + void (*secmark_refcount_inc) (void); + void (*secmark_refcount_dec) (void); void (*req_classify_flow) (const struct request_sock *req, struct flowi *fl); int (*tun_dev_create)(void); void (*tun_dev_post_create)(struct sock *sk); @@ -1752,8 +1764,7 @@ int security_task_setioprio(struct task_struct *p, int ioprio); int security_task_getioprio(struct task_struct *p); int security_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim); -int security_task_setscheduler(struct task_struct *p, - int policy, struct sched_param *lp); +int security_task_setscheduler(struct task_struct *p); int security_task_getscheduler(struct task_struct *p); int security_task_movememory(struct task_struct *p); int security_task_kill(struct task_struct *p, struct siginfo *info, @@ -2320,11 +2331,9 @@ static inline int security_task_setrlimit(struct task_struct *p, return 0; } -static inline int security_task_setscheduler(struct task_struct *p, - int policy, - struct sched_param *lp) +static inline int security_task_setscheduler(struct task_struct *p) { - return cap_task_setscheduler(p, policy, lp); + return cap_task_setscheduler(p); } static inline int security_task_getscheduler(struct task_struct *p) @@ -2551,6 +2560,9 @@ void security_inet_csk_clone(struct sock *newsk, const struct request_sock *req); void security_inet_conn_established(struct sock *sk, struct sk_buff *skb); +int security_secmark_relabel_packet(u32 secid); +void security_secmark_refcount_inc(void); +void security_secmark_refcount_dec(void); int security_tun_dev_create(void); void security_tun_dev_post_create(struct sock *sk); int security_tun_dev_attach(struct sock *sk); @@ -2705,6 +2717,19 @@ static inline void security_inet_conn_established(struct sock *sk, { } +static inline int security_secmark_relabel_packet(u32 secid) +{ + return 0; +} + +static inline void security_secmark_refcount_inc(void) +{ +} + +static inline void security_secmark_refcount_dec(void) +{ +} + static inline int security_tun_dev_create(void) { return 0; diff --git a/include/linux/selinux.h b/include/linux/selinux.h index 82e0f26a1299..44f459612690 100644 --- a/include/linux/selinux.h +++ b/include/linux/selinux.h @@ -21,74 +21,11 @@ struct kern_ipc_perm; #ifdef CONFIG_SECURITY_SELINUX /** - * selinux_string_to_sid - map a security context string to a security ID - * @str: the security context string to be mapped - * @sid: ID value returned via this. - * - * Returns 0 if successful, with the SID stored in sid. A value - * of zero for sid indicates no SID could be determined (but no error - * occurred). - */ -int selinux_string_to_sid(char *str, u32 *sid); - -/** - * selinux_secmark_relabel_packet_permission - secmark permission check - * @sid: SECMARK ID value to be applied to network packet - * - * Returns 0 if the current task is allowed to set the SECMARK label of - * packets with the supplied security ID. Note that it is implicit that - * the packet is always being relabeled from the default unlabeled value, - * and that the access control decision is made in the AVC. - */ -int selinux_secmark_relabel_packet_permission(u32 sid); - -/** - * selinux_secmark_refcount_inc - increments the secmark use counter - * - * SELinux keeps track of the current SECMARK targets in use so it knows - * when to apply SECMARK label access checks to network packets. This - * function incements this reference count to indicate that a new SECMARK - * target has been configured. - */ -void selinux_secmark_refcount_inc(void); - -/** - * selinux_secmark_refcount_dec - decrements the secmark use counter - * - * SELinux keeps track of the current SECMARK targets in use so it knows - * when to apply SECMARK label access checks to network packets. This - * function decements this reference count to indicate that one of the - * existing SECMARK targets has been removed/flushed. - */ -void selinux_secmark_refcount_dec(void); - -/** * selinux_is_enabled - is SELinux enabled? */ bool selinux_is_enabled(void); #else -static inline int selinux_string_to_sid(const char *str, u32 *sid) -{ - *sid = 0; - return 0; -} - -static inline int selinux_secmark_relabel_packet_permission(u32 sid) -{ - return 0; -} - -static inline void selinux_secmark_refcount_inc(void) -{ - return; -} - -static inline void selinux_secmark_refcount_dec(void) -{ - return; -} - static inline bool selinux_is_enabled(void) { return false; diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 4d5d2f546dbf..58971e891f48 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -108,19 +108,43 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /** - * srcu_dereference - fetch SRCU-protected pointer with checking + * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing + * @p: the pointer to fetch and protect for later dereferencing + * @sp: pointer to the srcu_struct, which is used to check that we + * really are in an SRCU read-side critical section. + * @c: condition to check for update-side use * - * Makes rcu_dereference_check() do the dirty work. + * If PROVE_RCU is enabled, invoking this outside of an RCU read-side + * critical section will result in an RCU-lockdep splat, unless @c evaluates + * to 1. The @c argument will normally be a logical expression containing + * lockdep_is_held() calls. */ -#define srcu_dereference(p, sp) \ - rcu_dereference_check(p, srcu_read_lock_held(sp)) +#define srcu_dereference_check(p, sp, c) \ + __rcu_dereference_check((p), srcu_read_lock_held(sp) || (c), __rcu) + +/** + * srcu_dereference - fetch SRCU-protected pointer for later dereferencing + * @p: the pointer to fetch and protect for later dereferencing + * @sp: pointer to the srcu_struct, which is used to check that we + * really are in an SRCU read-side critical section. + * + * Makes rcu_dereference_check() do the dirty work. If PROVE_RCU + * is enabled, invoking this outside of an RCU read-side critical + * section will result in an RCU-lockdep splat. + */ +#define srcu_dereference(p, sp) srcu_dereference_check((p), (sp), 0) /** * srcu_read_lock - register a new reader for an SRCU-protected structure. * @sp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section. Note that SRCU read-side - * critical sections may be nested. + * critical sections may be nested. However, it is illegal to + * call anything that waits on an SRCU grace period for the same + * srcu_struct, whether directly or indirectly. Please note that + * one way to indirectly wait on an SRCU grace period is to acquire + * a mutex that is held elsewhere while calling synchronize_srcu() or + * synchronize_srcu_expedited(). */ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) { diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h index 671538d25bc1..8eee9dbbfe7a 100644 --- a/include/linux/sunrpc/auth_gss.h +++ b/include/linux/sunrpc/auth_gss.h @@ -69,7 +69,7 @@ struct gss_cl_ctx { enum rpc_gss_proc gc_proc; u32 gc_seq; spinlock_t gc_seq_lock; - struct gss_ctx *gc_gss_ctx; + struct gss_ctx __rcu *gc_gss_ctx; struct xdr_netobj gc_wire_ctx; u32 gc_win; unsigned long gc_expiry; @@ -80,7 +80,7 @@ struct gss_upcall_msg; struct gss_cred { struct rpc_cred gc_base; enum rpc_gss_svc gc_service; - struct gss_cl_ctx *gc_ctx; + struct gss_cl_ctx __rcu *gc_ctx; struct gss_upcall_msg *gc_upcall; unsigned long gc_upcall_timestamp; unsigned char gc_machine_cred : 1; diff --git a/include/linux/types.h b/include/linux/types.h index 01a082f56ef4..357dbc19606f 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -121,7 +121,15 @@ typedef __u64 u_int64_t; typedef __s64 int64_t; #endif -/* this is a special 64bit data type that is 8-byte aligned */ +/* + * aligned_u64 should be used in defining kernel<->userspace ABIs to avoid + * common 32/64-bit compat problems. + * 64-bit values align to 4-byte boundaries on x86_32 (and possibly other + * architectures) and to 8-byte boundaries on 64-bit architetures. The new + * aligned_64 type enforces 8-byte alignment so that structs containing + * aligned_64 values have the same alignment on 32-bit and 64-bit architectures. + * No conversions are necessary between 32-bit user-space and a 64-bit kernel. + */ #define aligned_u64 __u64 __attribute__((aligned(8))) #define aligned_be64 __be64 __attribute__((aligned(8))) #define aligned_le64 __le64 __attribute__((aligned(8))) @@ -178,6 +186,11 @@ typedef __u64 __bitwise __be64; typedef __u16 __bitwise __sum16; typedef __u32 __bitwise __wsum; +/* this is a special 64bit data type that is 8-byte aligned */ +#define __aligned_u64 __u64 __attribute__((aligned(8))) +#define __aligned_be64 __be64 __attribute__((aligned(8))) +#define __aligned_le64 __le64 __attribute__((aligned(8))) + #ifdef __KERNEL__ typedef unsigned __bitwise__ gfp_t; typedef unsigned __bitwise__ fmode_t; diff --git a/include/media/videobuf-dma-sg.h b/include/media/videobuf-dma-sg.h index 97e07f46a0fa..aa4ebb42a565 100644 --- a/include/media/videobuf-dma-sg.h +++ b/include/media/videobuf-dma-sg.h @@ -48,6 +48,7 @@ struct videobuf_dmabuf { /* for userland buffer */ int offset; + size_t size; struct page **pages; /* for kernel buffers */ diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 27a902d9b3a9..30fce0128dd7 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -161,12 +161,30 @@ static inline struct sk_buff *bt_skb_send_alloc(struct sock *sk, unsigned long l { struct sk_buff *skb; + release_sock(sk); if ((skb = sock_alloc_send_skb(sk, len + BT_SKB_RESERVE, nb, err))) { skb_reserve(skb, BT_SKB_RESERVE); bt_cb(skb)->incoming = 0; } + lock_sock(sk); + + if (!skb && *err) + return NULL; + + *err = sock_error(sk); + if (*err) + goto out; + + if (sk->sk_shutdown) { + *err = -ECONNRESET; + goto out; + } return skb; + +out: + kfree_skb(skb); + return NULL; } int bt_err(__u16 code); diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index ef6c24a529e1..a4dc5b027bd9 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -51,7 +51,8 @@ static inline u32 task_cls_classid(struct task_struct *p) return 0; rcu_read_lock(); - id = rcu_dereference(net_cls_subsys_id); + id = rcu_dereference_index_check(net_cls_subsys_id, + rcu_read_lock_held()); if (id >= 0) classid = container_of(task_subsys_state(p, id), struct cgroup_cls_state, css)->classid; diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index e624dae54fa4..caf17db87dbc 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -75,7 +75,7 @@ struct nf_conntrack_helper; /* nf_conn feature for connections that have a helper */ struct nf_conn_help { /* Helper. if any */ - struct nf_conntrack_helper *helper; + struct nf_conntrack_helper __rcu *helper; union nf_conntrack_help help; diff --git a/init/Kconfig b/init/Kconfig index 1ef0b439908e..7b920aafa98a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -347,6 +347,7 @@ choice config TREE_RCU bool "Tree-based hierarchical RCU" + depends on !PREEMPT && SMP help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -354,7 +355,7 @@ config TREE_RCU smaller systems. config TREE_PREEMPT_RCU - bool "Preemptable tree-based hierarchical RCU" + bool "Preemptible tree-based hierarchical RCU" depends on PREEMPT help This option selects the RCU implementation that is @@ -372,8 +373,22 @@ config TINY_RCU is not required. This option greatly reduces the memory footprint of RCU. +config TINY_PREEMPT_RCU + bool "Preemptible UP-only small-memory-footprint RCU" + depends on !SMP && PREEMPT + help + This option selects the RCU implementation that is designed + for real-time UP systems. This option greatly reduces the + memory footprint of RCU. + endchoice +config PREEMPT_RCU + def_bool ( TREE_PREEMPT_RCU || TINY_PREEMPT_RCU ) + help + This option enables preemptible-RCU code that is common between + the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. + config RCU_TRACE bool "Enable tracing for RCU" depends on TREE_RCU || TREE_PREEMPT_RCU @@ -394,9 +409,12 @@ config RCU_FANOUT help This option controls the fanout of hierarchical implementations of RCU, allowing RCU to work efficiently on machines with - large numbers of CPUs. This value must be at least the cube - root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit - systems and up to 262,144 for 64-bit systems. + large numbers of CPUs. This value must be at least the fourth + root of NR_CPUS, which allows NR_CPUS to be insanely large. + The default value of RCU_FANOUT should be used for production + systems, but if you are stress-testing the RCU implementation + itself, small RCU_FANOUT values allow you to test large-system + code paths on small(er) systems. Select a specific number if testing RCU itself. Take the default if unsure. diff --git a/kernel/Makefile b/kernel/Makefile index 4d9bf5f8531f..e2c9d52cfe9e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -87,6 +87,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o obj-$(CONFIG_TINY_RCU) += rcutiny.o +obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c9483d8f6140..291ba3d04bea 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -138,7 +138,7 @@ struct css_id { * is called after synchronize_rcu(). But for safe use, css_is_removed() * css_tryget() should be used for avoiding race. */ - struct cgroup_subsys_state *css; + struct cgroup_subsys_state __rcu *css; /* * ID of this css. */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b23c0979bbe7..51b143e2a07a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - ret = security_task_setscheduler(tsk, 0, NULL); + ret = security_task_setscheduler(tsk); if (ret) return ret; if (threadgroup) { @@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, rcu_read_lock(); list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - ret = security_task_setscheduler(c, 0, NULL); + ret = security_task_setscheduler(c); if (ret) { rcu_read_unlock(); return ret; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 1decafbb6b1a..72206cf5c6cf 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -931,6 +931,7 @@ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) { if (hrtimer_is_queued(timer)) { + unsigned long state; int reprogram; /* @@ -944,8 +945,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); - __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, - reprogram); + /* + * We must preserve the CALLBACK state flag here, + * otherwise we could move the timer base in + * switch_hrtimer_base. + */ + state = timer->state & HRTIMER_STATE_CALLBACK; + __remove_hrtimer(timer, base, state, reprogram); return 1; } return 0; @@ -1231,6 +1237,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); enqueue_hrtimer(timer, base); } + + WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); + timer->state &= ~HRTIMER_STATE_CALLBACK; } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0c642d51aac2..53ead174da2f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - __debug_show_held_locks(t); + debug_show_held_locks(t); touch_nmi_watchdog(); @@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * periodically exit the critical section and enter a new one. * * For preemptible RCU it is sufficient to call rcu_read_unlock in order - * exit the grace period. For classic RCU, a reschedule is required. + * to exit the grace period. For classic RCU, a reschedule is required. */ static void rcu_lock_break(struct task_struct *g, struct task_struct *t) { diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f2852a510232..42ba65dff7d9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) } #endif + if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { + debug_locks_off(); + printk(KERN_ERR + "BUG: looking up invalid subclass: %u\n", subclass); + printk(KERN_ERR + "turning off the locking correctness validator.\n"); + dump_stack(); + return NULL; + } + /* * Static locks do not have their class-keys yet - for them the key * is the lock object itself: @@ -774,7 +784,9 @@ out_unlock_set: raw_local_irq_restore(flags); if (!subclass || force) - lock->class_cache = class; + lock->class_cache[0] = class; + else if (subclass < NR_LOCKDEP_CACHING_CLASSES) + lock->class_cache[subclass] = class; if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) return NULL; @@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - lock->class_cache = NULL; + int i; + + for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) + lock->class_cache[i] = NULL; + #ifdef CONFIG_LOCK_STAT lock->cpu = raw_smp_processor_id(); #endif @@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { - debug_locks_off(); - printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - return 0; - } - if (lock->key == &__lockdep_no_validate__) check = 1; - if (!subclass) - class = lock->class_cache; + if (subclass < NR_LOCKDEP_CACHING_CLASSES) + class = lock->class_cache[subclass]; /* - * Not cached yet or subclass? + * Not cached? */ if (unlikely(!class)) { class = register_lock_class(lock, subclass, 0); @@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) return 1; if (hlock->references) { - struct lock_class *class = lock->class_cache; + struct lock_class *class = lock->class_cache[0]; if (!class) class = look_up_lock_class(lock, 0); @@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock) if (list_empty(head)) continue; list_for_each_entry_safe(class, next, head, hash_entry) { - if (unlikely(class == lock->class_cache)) { + int match = 0; + + for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) + match |= class == lock->class_cache[j]; + + if (unlikely(match)) { if (debug_locks_off_graph_unlock()) WARN_ON(1); goto out_restore; @@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks); * Careful: only use this function if you are sure that * the task cannot run in parallel! */ -void __debug_show_held_locks(struct task_struct *task) +void debug_show_held_locks(struct task_struct *task) { if (unlikely(!debug_locks)) { printk("INFO: lockdep is turned off.\n"); @@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task) } lockdep_print_held_locks(task); } -EXPORT_SYMBOL_GPL(__debug_show_held_locks); - -void debug_show_held_locks(struct task_struct *task) -{ - __debug_show_held_locks(task); -} EXPORT_SYMBOL_GPL(debug_show_held_locks); void lockdep_sys_exit(void) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 05ecf6f7c672..f309e8014c78 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2509,15 +2509,13 @@ static void perf_event_for_each(struct perf_event *event, static int perf_event_period(struct perf_event *event, u64 __user *arg) { struct perf_event_context *ctx = event->ctx; - unsigned long size; int ret = 0; u64 value; if (!event->attr.sample_period) return -EINVAL; - size = copy_from_user(&value, arg, sizeof(value)); - if (size != sizeof(value)) + if (copy_from_user(&value, arg, sizeof(value))) return -EFAULT; if (!value) diff --git a/kernel/pid.c b/kernel/pid.c index d55c6fb8d087..39b65b69584f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) struct task_struct *result = NULL; if (pid) { struct hlist_node *first; - first = rcu_dereference_check(pid->tasks[type].first, + first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), rcu_read_lock_held() || lockdep_tasklist_lock_is_held()); if (first) @@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task); */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { + rcu_lockdep_assert(rcu_read_lock_held()); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } diff --git a/kernel/printk.c b/kernel/printk.c index 8fe465ac008a..2531017795f6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -85,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress); * provides serialisation for access to the entire console * driver system. */ -static DECLARE_MUTEX(console_sem); +static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); @@ -556,7 +556,7 @@ static void zap_locks(void) /* If a crash is occurring, make sure we can't deadlock */ spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ - init_MUTEX(&console_sem); + sema_init(&console_sem, 1); } #if defined(CONFIG_PRINTK_TIME) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4d169835fb36..a23a57a976d1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void) EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); /** - * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? + * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? * * Check for bottom half being disabled, which covers both the * CONFIG_PROVE_RCU and not cases. Note that if someone uses * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) - * will show the situation. + * will show the situation. This is useful for debug checks in functions + * that require that they be called within an RCU read-side critical + * section. * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. */ @@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; - return in_softirq(); + return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 196ec02f8be0..d806735342ac 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* Forward declarations for rcutiny_plugin.h. */ +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); +static void __call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu), + struct rcu_ctrlblk *rcp); + +#include "rcutiny_plugin.h" + #ifdef CONFIG_NO_HZ static long rcu_dynticks_nesting = 1; @@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); + rcu_preempt_check_callbacks(); } /* @@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) *rcp->donetail = NULL; if (rcp->curtail == rcp->donetail) rcp->curtail = &rcp->rcucblist; + rcu_preempt_remove_callbacks(rcp); rcp->donetail = &rcp->rcucblist; local_irq_restore(flags); @@ -182,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); + rcu_preempt_process_callbacks(); } /* @@ -223,15 +234,15 @@ static void __call_rcu(struct rcu_head *head, } /* - * Post an RCU callback to be invoked after the end of an RCU grace + * Post an RCU callback to be invoked after the end of an RCU-sched grace * period. But since we have but one CPU, that would be after any * quiescent state. */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { __call_rcu(head, func, &rcu_sched_ctrlblk); } -EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(call_rcu_sched); /* * Post an RCU bottom-half callback to be invoked after any subsequent @@ -243,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); -void rcu_barrier(void) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - void rcu_barrier_bh(void) { struct rcu_synchronize rcu; @@ -289,5 +286,3 @@ void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } - -#include "rcutiny_plugin.h" diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index d223a92bc742..6ceca4f745ff 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -1,7 +1,7 @@ /* - * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition * Internal non-public definitions that provide either classic - * or preemptable semantics. + * or preemptible semantics. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -17,11 +17,587 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * Copyright IBM Corporation, 2009 + * Copyright (c) 2010 Linaro * * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ +#ifdef CONFIG_TINY_PREEMPT_RCU + +#include <linux/delay.h> + +/* Global control variables for preemptible RCU. */ +struct rcu_preempt_ctrlblk { + struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */ + struct rcu_head **nexttail; + /* Tasks blocked in a preemptible RCU */ + /* read-side critical section while an */ + /* preemptible-RCU grace period is in */ + /* progress must wait for a later grace */ + /* period. This pointer points to the */ + /* ->next pointer of the last task that */ + /* must wait for a later grace period, or */ + /* to &->rcb.rcucblist if there is no */ + /* such task. */ + struct list_head blkd_tasks; + /* Tasks blocked in RCU read-side critical */ + /* section. Tasks are placed at the head */ + /* of this list and age towards the tail. */ + struct list_head *gp_tasks; + /* Pointer to the first task blocking the */ + /* current grace period, or NULL if there */ + /* is not such task. */ + struct list_head *exp_tasks; + /* Pointer to first task blocking the */ + /* current expedited grace period, or NULL */ + /* if there is no such task. If there */ + /* is no current expedited grace period, */ + /* then there cannot be any such task. */ + u8 gpnum; /* Current grace period. */ + u8 gpcpu; /* Last grace period blocked by the CPU. */ + u8 completed; /* Last grace period completed. */ + /* If all three are equal, RCU is idle. */ +}; + +static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { + .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), +}; + +static int rcu_preempted_readers_exp(void); +static void rcu_report_exp_done(void); + +/* + * Return true if the CPU has not yet responded to the current grace period. + */ +static int rcu_cpu_blocking_cur_gp(void) +{ + return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; +} + +/* + * Check for a running RCU reader. Because there is only one CPU, + * there can be but one running RCU reader at a time. ;-) + */ +static int rcu_preempt_running_reader(void) +{ + return current->rcu_read_lock_nesting; +} + +/* + * Check for preempted RCU readers blocking any grace period. + * If the caller needs a reliable answer, it must disable hard irqs. + */ +static int rcu_preempt_blocked_readers_any(void) +{ + return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); +} + +/* + * Check for preempted RCU readers blocking the current grace period. + * If the caller needs a reliable answer, it must disable hard irqs. + */ +static int rcu_preempt_blocked_readers_cgp(void) +{ + return rcu_preempt_ctrlblk.gp_tasks != NULL; +} + +/* + * Return true if another preemptible-RCU grace period is needed. + */ +static int rcu_preempt_needs_another_gp(void) +{ + return *rcu_preempt_ctrlblk.rcb.curtail != NULL; +} + +/* + * Return true if a preemptible-RCU grace period is in progress. + * The caller must disable hardirqs. + */ +static int rcu_preempt_gp_in_progress(void) +{ + return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; +} + +/* + * Record a preemptible-RCU quiescent state for the specified CPU. Note + * that this just means that the task currently running on the CPU is + * in a quiescent state. There might be any number of tasks blocked + * while in an RCU read-side critical section. + * + * Unlike the other rcu_*_qs() functions, callers to this function + * must disable irqs in order to protect the assignment to + * ->rcu_read_unlock_special. + * + * Because this is a single-CPU implementation, the only way a grace + * period can end is if the CPU is in a quiescent state. The reason is + * that a blocked preemptible-RCU reader can exit its critical section + * only if the CPU is running it at the time. Therefore, when the + * last task blocking the current grace period exits its RCU read-side + * critical section, neither the CPU nor blocked tasks will be stopping + * the current grace period. (In contrast, SMP implementations + * might have CPUs running in RCU read-side critical sections that + * block later grace periods -- but this is not possible given only + * one CPU.) + */ +static void rcu_preempt_cpu_qs(void) +{ + /* Record both CPU and task as having responded to current GP. */ + rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; + current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + + /* + * If there is no GP, or if blocked readers are still blocking GP, + * then there is nothing more to do. + */ + if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) + return; + + /* Advance callbacks. */ + rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; + rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; + rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; + + /* If there are no blocked readers, next GP is done instantly. */ + if (!rcu_preempt_blocked_readers_any()) + rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; + + /* If there are done callbacks, make RCU_SOFTIRQ process them. */ + if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Start a new RCU grace period if warranted. Hard irqs must be disabled. + */ +static void rcu_preempt_start_gp(void) +{ + if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { + + /* Official start of GP. */ + rcu_preempt_ctrlblk.gpnum++; + + /* Any blocked RCU readers block new GP. */ + if (rcu_preempt_blocked_readers_any()) + rcu_preempt_ctrlblk.gp_tasks = + rcu_preempt_ctrlblk.blkd_tasks.next; + + /* If there is no running reader, CPU is done with GP. */ + if (!rcu_preempt_running_reader()) + rcu_preempt_cpu_qs(); + } +} + +/* + * We have entered the scheduler, and the current task might soon be + * context-switched away from. If this task is in an RCU read-side + * critical section, we will no longer be able to rely on the CPU to + * record that fact, so we enqueue the task on the blkd_tasks list. + * If the task started after the current grace period began, as recorded + * by ->gpcpu, we enqueue at the beginning of the list. Otherwise + * before the element referenced by ->gp_tasks (or at the tail if + * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. + * The task will dequeue itself when it exits the outermost enclosing + * RCU read-side critical section. Therefore, the current grace period + * cannot be permitted to complete until the ->gp_tasks pointer becomes + * NULL. + * + * Caller must disable preemption. + */ +void rcu_preempt_note_context_switch(void) +{ + struct task_struct *t = current; + unsigned long flags; + + local_irq_save(flags); /* must exclude scheduler_tick(). */ + if (rcu_preempt_running_reader() && + (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { + + /* Possibly blocking in an RCU read-side critical section. */ + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; + + /* + * If this CPU has already checked in, then this task + * will hold up the next grace period rather than the + * current grace period. Queue the task accordingly. + * If the task is queued for the current grace period + * (i.e., this CPU has not yet passed through a quiescent + * state for the current grace period), then as long + * as that task remains queued, the current grace period + * cannot end. + */ + list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); + if (rcu_cpu_blocking_cur_gp()) + rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; + } + + /* + * Either we were not in an RCU read-side critical section to + * begin with, or we have now recorded that critical section + * globally. Either way, we can now note a quiescent state + * for this CPU. Again, if we were in an RCU read-side critical + * section, and if that critical section was blocking the current + * grace period, then the fact that the task has been enqueued + * means that current grace period continues to be blocked. + */ + rcu_preempt_cpu_qs(); + local_irq_restore(flags); +} + +/* + * Tiny-preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Handle special cases during rcu_read_unlock(), such as needing to + * notify RCU core processing or task having blocked during the RCU + * read-side critical section. + */ +static void rcu_read_unlock_special(struct task_struct *t) +{ + int empty; + int empty_exp; + unsigned long flags; + struct list_head *np; + int special; + + /* + * NMI handlers cannot block and cannot safely manipulate state. + * They therefore cannot possibly be special, so just leave. + */ + if (in_nmi()) + return; + + local_irq_save(flags); + + /* + * If RCU core is waiting for this CPU to exit critical section, + * let it know that we have done so. + */ + special = t->rcu_read_unlock_special; + if (special & RCU_READ_UNLOCK_NEED_QS) + rcu_preempt_cpu_qs(); + + /* Hardware IRQ handlers cannot block. */ + if (in_irq()) { + local_irq_restore(flags); + return; + } + + /* Clean up if blocked during RCU read-side critical section. */ + if (special & RCU_READ_UNLOCK_BLOCKED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; + + /* + * Remove this task from the ->blkd_tasks list and adjust + * any pointers that might have been referencing it. + */ + empty = !rcu_preempt_blocked_readers_cgp(); + empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; + np = t->rcu_node_entry.next; + if (np == &rcu_preempt_ctrlblk.blkd_tasks) + np = NULL; + list_del(&t->rcu_node_entry); + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) + rcu_preempt_ctrlblk.gp_tasks = np; + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) + rcu_preempt_ctrlblk.exp_tasks = np; + INIT_LIST_HEAD(&t->rcu_node_entry); + + /* + * If this was the last task on the current list, and if + * we aren't waiting on the CPU, report the quiescent state + * and start a new grace period if needed. + */ + if (!empty && !rcu_preempt_blocked_readers_cgp()) { + rcu_preempt_cpu_qs(); + rcu_preempt_start_gp(); + } + + /* + * If this was the last task on the expedited lists, + * then we need wake up the waiting task. + */ + if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) + rcu_report_exp_done(); + } + local_irq_restore(flags); +} + +/* + * Tiny-preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ + --t->rcu_read_lock_nesting; + barrier(); /* decrement before load of ->rcu_read_unlock_special */ + if (t->rcu_read_lock_nesting == 0 && + unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); +#ifdef CONFIG_PROVE_LOCKING + WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +/* + * Check for a quiescent state from the current CPU. When a task blocks, + * the task is recorded in the rcu_preempt_ctrlblk structure, which is + * checked elsewhere. This is called from the scheduling-clock interrupt. + * + * Caller must disable hard irqs. + */ +static void rcu_preempt_check_callbacks(void) +{ + struct task_struct *t = current; + + if (rcu_preempt_gp_in_progress() && + (!rcu_preempt_running_reader() || + !rcu_cpu_blocking_cur_gp())) + rcu_preempt_cpu_qs(); + if (&rcu_preempt_ctrlblk.rcb.rcucblist != + rcu_preempt_ctrlblk.rcb.donetail) + raise_softirq(RCU_SOFTIRQ); + if (rcu_preempt_gp_in_progress() && + rcu_cpu_blocking_cur_gp() && + rcu_preempt_running_reader()) + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; +} + +/* + * TINY_PREEMPT_RCU has an extra callback-list tail pointer to + * update, so this is invoked from __rcu_process_callbacks() to + * handle that case. Of course, it is invoked for all flavors of + * RCU, but RCU callbacks can appear only on one of the lists, and + * neither ->nexttail nor ->donetail can possibly be NULL, so there + * is no need for an explicit check. + */ +static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) +{ + if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) + rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; +} + +/* + * Process callbacks for preemptible RCU. + */ +static void rcu_preempt_process_callbacks(void) +{ + __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); +} + +/* + * Queue a preemptible -RCU callback for invocation after a grace period. + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + + debug_rcu_head_queue(head); + head->func = func; + head->next = NULL; + + local_irq_save(flags); + *rcu_preempt_ctrlblk.nexttail = head; + rcu_preempt_ctrlblk.nexttail = &head->next; + rcu_preempt_start_gp(); /* checks to see if GP needed. */ + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(call_rcu); + +void rcu_barrier(void) +{ + struct rcu_synchronize rcu; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/* + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +void synchronize_rcu(void) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!rcu_scheduler_active) + return; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + + WARN_ON_ONCE(rcu_preempt_running_reader()); + if (!rcu_preempt_blocked_readers_any()) + return; + + /* Once we get past the fastpath checks, same code as rcu_barrier(). */ + rcu_barrier(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + +static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); +static unsigned long sync_rcu_preempt_exp_count; +static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); + +/* + * Return non-zero if there are any tasks in RCU read-side critical + * sections blocking the current preemptible-RCU expedited grace period. + * If there is no preemptible-RCU expedited grace period currently in + * progress, returns zero unconditionally. + */ +static int rcu_preempted_readers_exp(void) +{ + return rcu_preempt_ctrlblk.exp_tasks != NULL; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. + */ +static void rcu_report_exp_done(void) +{ + wake_up(&sync_rcu_preempt_exp_wq); +} + +/* + * Wait for an rcu-preempt grace period, but expedite it. The basic idea + * is to rely in the fact that there is but one CPU, and that it is + * illegal for a task to invoke synchronize_rcu_expedited() while in a + * preemptible-RCU read-side critical section. Therefore, any such + * critical sections must correspond to blocked tasks, which must therefore + * be on the ->blkd_tasks list. So just record the current head of the + * list in the ->exp_tasks pointer, and wait for all tasks including and + * after the task pointed to by ->exp_tasks to drain. + */ +void synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; + unsigned long snap; + + barrier(); /* ensure prior action seen before grace period. */ + + WARN_ON_ONCE(rcu_preempt_running_reader()); + + /* + * Acquire lock so that there is only one preemptible RCU grace + * period in flight. Of course, if someone does the expedited + * grace period for us while we are acquiring the lock, just leave. + */ + snap = sync_rcu_preempt_exp_count + 1; + mutex_lock(&sync_rcu_preempt_exp_mutex); + if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) + goto unlock_mb_ret; /* Others did our work for us. */ + + local_irq_save(flags); + + /* + * All RCU readers have to already be on blkd_tasks because + * we cannot legally be executing in an RCU read-side critical + * section. + */ + + /* Snapshot current head of ->blkd_tasks list. */ + rpcp->exp_tasks = rpcp->blkd_tasks.next; + if (rpcp->exp_tasks == &rpcp->blkd_tasks) + rpcp->exp_tasks = NULL; + local_irq_restore(flags); + + /* Wait for tail of ->blkd_tasks list to drain. */ + if (rcu_preempted_readers_exp()) + wait_event(sync_rcu_preempt_exp_wq, + !rcu_preempted_readers_exp()); + + /* Clean up and exit. */ + barrier(); /* ensure expedited GP seen before counter increment. */ + sync_rcu_preempt_exp_count++; +unlock_mb_ret: + mutex_unlock(&sync_rcu_preempt_exp_mutex); + barrier(); /* ensure subsequent action seen after grace period. */ +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* + * Does preemptible RCU need the CPU to stay out of dynticks mode? + */ +int rcu_preempt_needs_cpu(void) +{ + if (!rcu_preempt_running_reader()) + rcu_preempt_cpu_qs(); + return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; +} + +/* + * Check for a task exiting while in a preemptible -RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting == 0) + return; + t->rcu_read_lock_nesting = 1; + rcu_read_unlock(); +} + +#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to check. + */ +static void rcu_preempt_check_callbacks(void) +{ +} + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to remove. + */ +static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) +{ +} + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to process. + */ +static void rcu_preempt_process_callbacks(void) +{ +} + +#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC #include <linux/kernel_stat.h> diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 2e2726d790b9..9d8e8fb2515f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -120,7 +120,7 @@ struct rcu_torture { }; static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture *rcu_torture_current; +static struct rcu_torture __rcu *rcu_torture_current; static long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); @@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ static int fullstop = FULLSTOP_RMMOD; -DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ - /* of kthreads. */ +/* + * Protect fullstop transitions and spawning of kthreads. + */ +static DEFINE_MUTEX(fullstop_mutex); /* * Detect and respond to a system shutdown. @@ -303,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp) mdelay(longdelay_ms); if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) + preempt_schedule(); /* No QS if preempt_disable() in effect */ +#endif } static void rcu_torture_read_unlock(int idx) __releases(RCU) @@ -536,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp) delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); if (!delay) schedule_timeout_interruptible(longdelay); + else + rcu_read_delay(rrsp); } static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) @@ -731,7 +739,8 @@ rcu_torture_writer(void *arg) continue; rp->rtort_pipe_count = 0; udelay(rcu_random(&rand) & 0x3ff); - old_rp = rcu_torture_current; + old_rp = rcu_dereference_check(rcu_torture_current, + current == writer_task); rp->rtort_mbtest = 1; rcu_assign_pointer(rcu_torture_current, rp); smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d5bc43976c5a..ccdc04c47981 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -143,6 +143,11 @@ module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; +module_param(rcu_cpu_stall_suppress, int, 0644); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #ifdef CONFIG_RCU_CPU_STALL_DETECTOR -int rcu_cpu_stall_panicking __read_mostly; +int rcu_cpu_stall_suppress __read_mostly; static void record_gp_stall_check_time(struct rcu_state *rsp) { @@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); - /* OK, time to rat on our buddy... */ - + /* + * OK, time to rat on our buddy... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", rsp->name); rcu_for_each_leaf_node(rsp, rnp) { @@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp) unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); + /* + * OK, time to rat on ourselves... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", rsp->name, smp_processor_id(), jiffies - rsp->gp_start); trigger_all_cpu_backtrace(); @@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) long delta; struct rcu_node *rnp; - if (rcu_cpu_stall_panicking) + if (rcu_cpu_stall_suppress) return; - delta = jiffies - rsp->jiffies_stall; + delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); rnp = rdp->mynode; - if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { + if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); @@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) { - rcu_cpu_stall_panicking = 1; + rcu_cpu_stall_suppress = 1; return NOTIFY_DONE; } +/** + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period + * + * Set the stall-warning timeout way off into the future, thus preventing + * any RCU CPU stall-warning messages from appearing in the current set of + * RCU grace periods. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ + rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; + rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; + rcu_preempt_stall_reset(); +} + static struct notifier_block rcu_panic_block = { .notifier_call = rcu_panic, }; @@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { } +void rcu_cpu_stall_reset(void) +{ +} + static void __init check_cpu_stall_init(void) { } @@ -712,7 +745,7 @@ static void rcu_start_gp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { - struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { @@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) { int i; - struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); if (rdp->nxtlist == NULL) return; /* irqs disabled, so comparison is stable. */ @@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; rsp->orphan_qlen += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen = 0; raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ } @@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) struct rcu_data *rdp; raw_spin_lock_irqsave(&rsp->onofflock, flags); - rdp = rsp->rda[smp_processor_id()]; + rdp = this_cpu_ptr(rsp->rda); if (rsp->orphan_cbs_list == NULL) { raw_spin_unlock_irqrestore(&rsp->onofflock, flags); return; @@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; rdp->qlen += rsp->orphan_qlen; + rdp->n_cbs_adopted += rsp->orphan_qlen; rsp->orphan_cbs_list = NULL; rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; rsp->orphan_qlen = 0; @@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) unsigned long flags; unsigned long mask; int need_report = 0; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp; /* Exclude any attempts to start a new grace period. */ @@ -1123,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Update count, and requeue any remaining callbacks. */ rdp->qlen -= count; + rdp->n_cbs_invoked += count; if (list != NULL) { *tail = rdp->nxtlist; rdp->nxtlist = list; @@ -1226,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) cpu = rnp->grplo; bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { - if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) + if ((rnp->qsmask & bit) != 0 && + f(per_cpu_ptr(rsp->rda, cpu))) mask |= bit; } if (mask != 0) { @@ -1402,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * a quiescent state betweentimes. */ local_irq_save(flags); - rdp = rsp->rda[smp_processor_id()]; + rdp = this_cpu_ptr(rsp->rda); rcu_process_gp_end(rsp, rdp); check_for_new_grace_period(rsp, rdp); @@ -1701,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; int i; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ @@ -1729,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) { unsigned long flags; unsigned long mask; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ @@ -1865,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) /* * Helper function for rcu_init() that initializes one rcu_state structure. */ -static void __init rcu_init_one(struct rcu_state *rsp) +static void __init rcu_init_one(struct rcu_state *rsp, + struct rcu_data __percpu *rda) { static char *buf[] = { "rcu_node_level_0", "rcu_node_level_1", @@ -1918,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp) } } + rsp->rda = rda; rnp = rsp->level[NUM_RCU_LVLS - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; - rsp->rda[i]->mynode = rnp; + per_cpu_ptr(rsp->rda, i)->mynode = rnp; rcu_boot_init_percpu_data(i, rsp); } } -/* - * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used - * nowhere else! Assigns leaf node pointers into each CPU's rcu_data - * structure. - */ -#define RCU_INIT_FLAVOR(rsp, rcu_data) \ -do { \ - int i; \ - \ - for_each_possible_cpu(i) { \ - (rsp)->rda[i] = &per_cpu(rcu_data, i); \ - } \ - rcu_init_one(rsp); \ -} while (0) - void __init rcu_init(void) { int cpu; rcu_bootup_announce(); - RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); - RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); + rcu_init_one(&rcu_sched_state, &rcu_sched_data); + rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 14c040b18ed0..91d4170c5c13 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -202,6 +202,9 @@ struct rcu_data { long qlen; /* # of queued callbacks */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ + unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ + unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ + unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -254,19 +257,23 @@ struct rcu_data { #define RCU_STALL_DELAY_DELTA 0 #endif -#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) +#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \ + RCU_STALL_DELAY_DELTA) /* for rsp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) +#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30) /* for rsp->jiffies_stall */ #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ /* to take at least one */ /* scheduling clock irq */ /* before ratting on them. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE +#define RCU_CPU_STALL_SUPPRESS_INIT 0 +#else +#define RCU_CPU_STALL_SUPPRESS_INIT 1 +#endif -#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) -#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* * RCU global state, including node hierarchy. This hierarchy is @@ -283,7 +290,7 @@ struct rcu_state { struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ - struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ + struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ /* The following fields are guarded by the root rcu_node's lock. */ @@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, #ifdef CONFIG_RCU_CPU_STALL_DETECTOR static void rcu_print_detail_task_stall(struct rcu_state *rsp); static void rcu_print_task_stall(struct rcu_node *rnp); +static void rcu_preempt_stall_reset(void); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0e4f420245d9..71a4147473f9 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tRCU-based detection of stalled CPUs is disabled.\n"); #endif -#ifndef CONFIG_RCU_CPU_STALL_VERBOSE +#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); #endif #if NUM_RCU_LVL_4 != 0 @@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = rcu_preempt_state.rda[cpu]; + rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu) */ void __rcu_read_lock(void) { - ACCESS_ONCE(current->rcu_read_lock_nesting)++; + current->rcu_read_lock_nesting++; barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -344,7 +344,9 @@ void __rcu_read_unlock(void) struct task_struct *t = current; barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ - if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && + --t->rcu_read_lock_nesting; + barrier(); /* decrement before load of ->rcu_read_unlock_special */ + if (t->rcu_read_lock_nesting == 0 && unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) rcu_read_unlock_special(t); #ifdef CONFIG_PROVE_LOCKING @@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp) } } +/* + * Suppress preemptible RCU's CPU stall warnings by pushing the + * time of the next stall-warning message comfortably far into the + * future. + */ +static void rcu_preempt_stall_reset(void) +{ + rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; +} + #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* @@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu); * * Control will return to the caller some time after a full grace * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. + * read-side critical sections have completed. Note, however, that + * upon return from synchronize_rcu(), the caller might well be executing + * concurrently with new RCU read-side critical sections that began while + * synchronize_rcu() was waiting. RCU read-side critical sections are + * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. */ void synchronize_rcu(void) { @@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void) */ static void __init __rcu_init_preempt(void) { - RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); + rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); } /* @@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp) { } +/* + * Because preemptible RCU does not exist, there is no need to suppress + * its CPU stall warnings. + */ +static void rcu_preempt_stall_reset(void) +{ +} + #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* @@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void) } /* - * In classic RCU, call_rcu() is just call_rcu_sched(). - */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - call_rcu_sched(head, func); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/* * Wait for an rcu-preempt grace period, but make it happen quickly. * But because preemptable RCU does not exist, map to rcu-sched. */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 36c95b45738e..d15430b9d122 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); + seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); + seq_printf(m, " ci=%lu co=%lu ca=%lu\n", + rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } #define PRINT_RCU_DATA(name, func, m) \ @@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); + seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); + seq_printf(m, ",%lu,%lu,%lu\n", + rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } static int show_rcudata_csv(struct seq_file *m, void *unused) @@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) #ifdef CONFIG_NO_HZ seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ - seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); + seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); #ifdef CONFIG_TREE_PREEMPT_RCU seq_puts(m, "\"rcu_preempt:\"\n"); PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); @@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) struct rcu_data *rdp; for_each_possible_cpu(cpu) { - rdp = rsp->rda[cpu]; + rdp = per_cpu_ptr(rsp->rda, cpu); if (rdp->beenonline) print_one_rcu_pending(m, rdp); } diff --git a/kernel/sched.c b/kernel/sched.c index c0d2067f3e0d..5a5cc33e4999 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4645,7 +4645,7 @@ recheck: } if (user) { - retval = security_task_setscheduler(p, policy, param); + retval = security_task_setscheduler(p); if (retval) return retval; } @@ -4887,7 +4887,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; - retval = security_task_setscheduler(p, 0, NULL); + retval = security_task_setscheduler(p); if (retval) goto out_unlock; @@ -5337,7 +5337,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); + /* + * We're having a chicken and egg problem, even though we are + * holding rq->lock, the cpu isn't yet set to this cpu so the + * lockdep check in task_group() will fail. + * + * Similar case to sched_fork(). / Alternatively we could + * use task_rq_lock() here and obtain the other rq->lock. + * + * Silence PROVE_RCU + */ + rcu_read_lock(); __set_task_cpu(idle, cpu); + rcu_read_unlock(); rq->curr = rq->idle = idle; #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index db3f674ca49d..5f996d36ac5d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3751,8 +3751,11 @@ static void task_fork_fair(struct task_struct *p) update_rq_clock(rq); - if (unlikely(task_cpu(p) != this_cpu)) + if (unlikely(task_cpu(p) != this_cpu)) { + rcu_read_lock(); __set_task_cpu(p, this_cpu); + rcu_read_unlock(); + } update_curr(cfs_rq); diff --git a/kernel/signal.c b/kernel/signal.c index bded65187780..919562c3d6b7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2215,6 +2215,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) #ifdef __ARCH_SI_TRAPNO err |= __put_user(from->si_trapno, &to->si_trapno); #endif +#ifdef BUS_MCEERR_AO + /* + * Other callers might not initialize the si_lsb field, + * so check explicitely for the right codes here. + */ + if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) + err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); +#endif break; case __SI_CHLD: err |= __put_user(from->si_pid, &to->si_pid); diff --git a/kernel/srcu.c b/kernel/srcu.c index 2980da3fd509..c71e07500536 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) int __init_srcu_struct(struct srcu_struct *sp, const char *name, struct lock_class_key *key) { -#ifdef CONFIG_DEBUG_LOCK_ALLOC /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)sp, sizeof(*sp)); lockdep_init_map(&sp->dep_map, name, key, 0); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ return init_srcu_struct_fields(sp); } EXPORT_SYMBOL_GPL(__init_srcu_struct); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f88552c6d227..3a45c224770f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2485,7 +2485,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int kbuf[left] = 0; } - for (; left && vleft--; i++, min++, max++, first=0) { + for (; left && vleft--; i++, first = 0) { unsigned long val; if (write) { diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 04cdcf72c827..10b90d8a03c4 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -143,15 +143,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) if (!table->maxlen) set_fail(&fail, table, "No maxlen"); } - if ((table->proc_handler == proc_doulongvec_minmax) || - (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { - if (table->maxlen > sizeof (unsigned long)) { - if (!table->extra1) - set_fail(&fail, table, "No min"); - if (!table->extra2) - set_fail(&fail, table, "No max"); - } - } #ifdef CONFIG_PROC_SYSCTL if (table->procname && !table->proc_handler) set_fail(&fail, table, "No proc_handler"); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4e2f03410377..c5a632a669e1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -405,7 +405,7 @@ static inline int test_time_stamp(u64 delta) #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) /* Max number of timestamps that can fit on a page */ -#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) +#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND) int ring_buffer_print_page_header(struct trace_seq *s) { diff --git a/kernel/watchdog.c b/kernel/watchdog.c index dc8e16824b51..bafba687a6d8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -196,7 +196,7 @@ static struct perf_event_attr wd_hw_attr = { }; /* Callback function for perf event subsystem */ -void watchdog_overflow_callback(struct perf_event *event, int nmi, +static void watchdog_overflow_callback(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e85d549b6eac..21ac83070a80 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -540,6 +540,23 @@ config PROVE_RCU_REPEATEDLY disabling, allowing multiple RCU-lockdep warnings to be printed on a single reboot. + Say Y to allow multiple RCU-lockdep warnings per boot. + + Say N if you are unsure. + +config SPARSE_RCU_POINTER + bool "RCU debugging: sparse-based checks for pointer usage" + default n + help + This feature enables the __rcu sparse annotation for + RCU-protected pointers. This annotation will cause sparse + to flag any non-RCU used of annotated pointers. This can be + helpful when debugging RCU usage. Please note that this feature + is not intended to enforce code cleanliness; it is instead merely + a debugging aid. + + Say Y to make sparse flag questionable use of RCU-protected pointers + Say N if you are unsure. config LOCKDEP @@ -832,6 +849,30 @@ config RCU_CPU_STALL_DETECTOR Say Y if you are unsure. +config RCU_CPU_STALL_TIMEOUT + int "RCU CPU stall timeout in seconds" + depends on RCU_CPU_STALL_DETECTOR + range 3 300 + default 60 + help + If a given RCU grace period extends more than the specified + number of seconds, a CPU stall warning is printed. If the + RCU grace period persists, additional CPU stall warnings are + printed at more widely spaced intervals. + +config RCU_CPU_STALL_DETECTOR_RUNNABLE + bool "RCU CPU stall checking starts automatically at boot" + depends on RCU_CPU_STALL_DETECTOR + default y + help + If set, start checking for RCU CPU stalls immediately on + boot. Otherwise, RCU CPU stall checking must be manually + enabled. + + Say Y if you are unsure. + + Say N if you wish to suppress RCU CPU stall checking during boot. + config RCU_CPU_STALL_VERBOSE bool "Print additional per-task information for RCU_CPU_STALL_DETECTOR" depends on RCU_CPU_STALL_DETECTOR && TREE_PREEMPT_RCU diff --git a/lib/radix-tree.c b/lib/radix-tree.c index efd16fa80b1c..6f412ab4c24f 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -49,7 +49,7 @@ struct radix_tree_node { unsigned int height; /* Height from the bottom */ unsigned int count; struct rcu_head rcu_head; - void *slots[RADIX_TREE_MAP_SIZE]; + void __rcu *slots[RADIX_TREE_MAP_SIZE]; unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3eed583895a6..9be3cf8a5da4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3587,9 +3587,13 @@ unlock: static void mem_cgroup_threshold(struct mem_cgroup *memcg) { - __mem_cgroup_threshold(memcg, false); - if (do_swap_account) - __mem_cgroup_threshold(memcg, true); + while (memcg) { + __mem_cgroup_threshold(memcg, false); + if (do_swap_account) + __mem_cgroup_threshold(memcg, true); + + memcg = parent_mem_cgroup(memcg); + } } static int compare_thresholds(const void *a, const void *b) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9c26eeca1342..757f6b0accfe 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter); * signal. */ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, - unsigned long pfn) + unsigned long pfn, struct page *page) { struct siginfo si; int ret; @@ -198,7 +198,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = PAGE_SHIFT; + si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; /* * Don't use force here, it's convenient if the signal * can be temporarily blocked. @@ -235,7 +235,7 @@ void shake_page(struct page *p, int access) int nr; do { nr = shrink_slab(1000, GFP_KERNEL, 1000); - if (page_count(p) == 0) + if (page_count(p) == 1) break; } while (nr > 10); } @@ -327,7 +327,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * wrong earlier. */ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, - int fail, unsigned long pfn) + int fail, struct page *page, unsigned long pfn) { struct to_kill *tk, *next; @@ -352,7 +352,7 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, * process anyways. */ else if (kill_proc_ao(tk->tsk, tk->addr, trapno, - pfn) < 0) + pfn, page) < 0) printk(KERN_ERR "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", pfn, tk->tsk->comm, tk->tsk->pid); @@ -928,7 +928,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * any accesses to the poisoned memory. */ kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, - ret != SWAP_SUCCESS, pfn); + ret != SWAP_SUCCESS, p, pfn); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a8cfa9cc6e86..f12ad1836abe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5182,9 +5182,9 @@ void *__init alloc_large_system_hash(const char *tablename, if (!table) panic("Failed to allocate %s hash table\n", tablename); - printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", + printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", tablename, - (1U << log2qty), + (1UL << log2qty), ilog2(size) - PAGE_SHIFT, size); diff --git a/net/Kconfig b/net/Kconfig index e926884c1675..55fd82e9ffd9 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -293,6 +293,7 @@ source "net/wimax/Kconfig" source "net/rfkill/Kconfig" source "net/9p/Kconfig" source "net/caif/Kconfig" +source "net/ceph/Kconfig" endif # if NET diff --git a/net/Makefile b/net/Makefile index ea60fbce9b1b..6b7bfd7f1416 100644 --- a/net/Makefile +++ b/net/Makefile @@ -68,3 +68,4 @@ obj-$(CONFIG_SYSCTL) += sysctl_net.o endif obj-$(CONFIG_WIMAX) += wimax/ obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ +obj-$(CONFIG_CEPH_LIB) += ceph/ diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 622b471e14e0..74bcc662c3dd 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -778,7 +778,7 @@ static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb) eg->packets_rcvd++; mpc->eg_ops->put(eg); - memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); + memset(ATM_SKB(new_skb), 0, sizeof(struct atm_skb_data)); netif_rx(new_skb); } diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index fadf26b4ed7c..0b54b7dd8401 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -1441,33 +1441,23 @@ static inline void l2cap_do_send(struct sock *sk, struct sk_buff *skb) static void l2cap_streaming_send(struct sock *sk) { - struct sk_buff *skb, *tx_skb; + struct sk_buff *skb; struct l2cap_pinfo *pi = l2cap_pi(sk); u16 control, fcs; - while ((skb = sk->sk_send_head)) { - tx_skb = skb_clone(skb, GFP_ATOMIC); - - control = get_unaligned_le16(tx_skb->data + L2CAP_HDR_SIZE); + while ((skb = skb_dequeue(TX_QUEUE(sk)))) { + control = get_unaligned_le16(skb->data + L2CAP_HDR_SIZE); control |= pi->next_tx_seq << L2CAP_CTRL_TXSEQ_SHIFT; - put_unaligned_le16(control, tx_skb->data + L2CAP_HDR_SIZE); + put_unaligned_le16(control, skb->data + L2CAP_HDR_SIZE); if (pi->fcs == L2CAP_FCS_CRC16) { - fcs = crc16(0, (u8 *)tx_skb->data, tx_skb->len - 2); - put_unaligned_le16(fcs, tx_skb->data + tx_skb->len - 2); + fcs = crc16(0, (u8 *)skb->data, skb->len - 2); + put_unaligned_le16(fcs, skb->data + skb->len - 2); } - l2cap_do_send(sk, tx_skb); + l2cap_do_send(sk, skb); pi->next_tx_seq = (pi->next_tx_seq + 1) % 64; - - if (skb_queue_is_last(TX_QUEUE(sk), skb)) - sk->sk_send_head = NULL; - else - sk->sk_send_head = skb_queue_next(TX_QUEUE(sk), skb); - - skb = skb_dequeue(TX_QUEUE(sk)); - kfree_skb(skb); } } @@ -1960,6 +1950,11 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, char __us switch (optname) { case L2CAP_OPTIONS: + if (sk->sk_state == BT_CONNECTED) { + err = -EINVAL; + break; + } + opts.imtu = l2cap_pi(sk)->imtu; opts.omtu = l2cap_pi(sk)->omtu; opts.flush_to = l2cap_pi(sk)->flush_to; @@ -2771,10 +2766,10 @@ static int l2cap_parse_conf_rsp(struct sock *sk, void *rsp, int len, void *data, case L2CAP_CONF_MTU: if (val < L2CAP_DEFAULT_MIN_MTU) { *result = L2CAP_CONF_UNACCEPT; - pi->omtu = L2CAP_DEFAULT_MIN_MTU; + pi->imtu = L2CAP_DEFAULT_MIN_MTU; } else - pi->omtu = val; - l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, pi->omtu); + pi->imtu = val; + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, pi->imtu); break; case L2CAP_CONF_FLUSH_TO: @@ -3071,6 +3066,17 @@ static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hd return 0; } +static inline void set_default_fcs(struct l2cap_pinfo *pi) +{ + /* FCS is enabled only in ERTM or streaming mode, if one or both + * sides request it. + */ + if (pi->mode != L2CAP_MODE_ERTM && pi->mode != L2CAP_MODE_STREAMING) + pi->fcs = L2CAP_FCS_NONE; + else if (!(pi->conf_state & L2CAP_CONF_NO_FCS_RECV)) + pi->fcs = L2CAP_FCS_CRC16; +} + static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data) { struct l2cap_conf_req *req = (struct l2cap_conf_req *) data; @@ -3088,14 +3094,8 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr if (!sk) return -ENOENT; - if (sk->sk_state != BT_CONFIG) { - struct l2cap_cmd_rej rej; - - rej.reason = cpu_to_le16(0x0002); - l2cap_send_cmd(conn, cmd->ident, L2CAP_COMMAND_REJ, - sizeof(rej), &rej); + if (sk->sk_state == BT_DISCONN) goto unlock; - } /* Reject if config buffer is too small. */ len = cmd_len - sizeof(*req); @@ -3135,9 +3135,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr goto unlock; if (l2cap_pi(sk)->conf_state & L2CAP_CONF_INPUT_DONE) { - if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_NO_FCS_RECV) || - l2cap_pi(sk)->fcs != L2CAP_FCS_NONE) - l2cap_pi(sk)->fcs = L2CAP_FCS_CRC16; + set_default_fcs(l2cap_pi(sk)); sk->sk_state = BT_CONNECTED; @@ -3225,9 +3223,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr l2cap_pi(sk)->conf_state |= L2CAP_CONF_INPUT_DONE; if (l2cap_pi(sk)->conf_state & L2CAP_CONF_OUTPUT_DONE) { - if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_NO_FCS_RECV) || - l2cap_pi(sk)->fcs != L2CAP_FCS_NONE) - l2cap_pi(sk)->fcs = L2CAP_FCS_CRC16; + set_default_fcs(l2cap_pi(sk)); sk->sk_state = BT_CONNECTED; l2cap_pi(sk)->next_tx_seq = 0; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 44a623275951..194b3a04cfd3 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -82,11 +82,14 @@ static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb) static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) { struct sock *sk = d->owner, *parent; + unsigned long flags; + if (!sk) return; BT_DBG("dlc %p state %ld err %d", d, d->state, err); + local_irq_save(flags); bh_lock_sock(sk); if (err) @@ -108,6 +111,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) } bh_unlock_sock(sk); + local_irq_restore(flags); if (parent && sock_flag(sk, SOCK_ZAPPED)) { /* We have to drop DLC lock here, otherwise diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 8ce904786116..4bf28f25f368 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -827,6 +827,7 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr, long timeo; int err; int ifindex, headroom, tailroom; + unsigned int mtu; struct net_device *dev; lock_sock(sk); @@ -896,15 +897,23 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr, cf_sk->sk.sk_state = CAIF_DISCONNECTED; goto out; } - dev = dev_get_by_index(sock_net(sk), ifindex); + + err = -ENODEV; + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), ifindex); + if (!dev) { + rcu_read_unlock(); + goto out; + } cf_sk->headroom = LL_RESERVED_SPACE_EXTRA(dev, headroom); + mtu = dev->mtu; + rcu_read_unlock(); + cf_sk->tailroom = tailroom; - cf_sk->maxframe = dev->mtu - (headroom + tailroom); - dev_put(dev); + cf_sk->maxframe = mtu - (headroom + tailroom); if (cf_sk->maxframe < 1) { - pr_warning("CAIF: %s(): CAIF Interface MTU too small (%d)\n", - __func__, dev->mtu); - err = -ENODEV; + pr_warning("CAIF: %s(): CAIF Interface MTU too small (%u)\n", + __func__, mtu); goto out; } diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig new file mode 100644 index 000000000000..ad424049b0cf --- /dev/null +++ b/net/ceph/Kconfig @@ -0,0 +1,28 @@ +config CEPH_LIB + tristate "Ceph core library (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + select LIBCRC32C + select CRYPTO_AES + select CRYPTO + default n + help + Choose Y or M here to include cephlib, which provides the + common functionality to both the Ceph filesystem and + to the rados block device (rbd). + + More information at http://ceph.newdream.net/. + + If unsure, say N. + +config CEPH_LIB_PRETTYDEBUG + bool "Include file:line in ceph debug output" + depends on CEPH_LIB + default n + help + If you say Y here, debug output will include a filename and + line to aid debugging. This increases kernel size and slows + execution slightly when debug call sites are enabled (e.g., + via CONFIG_DYNAMIC_DEBUG). + + If unsure, say N. + diff --git a/net/ceph/Makefile b/net/ceph/Makefile new file mode 100644 index 000000000000..aab1cabb8035 --- /dev/null +++ b/net/ceph/Makefile @@ -0,0 +1,37 @@ +# +# Makefile for CEPH filesystem. +# + +ifneq ($(KERNELRELEASE),) + +obj-$(CONFIG_CEPH_LIB) += libceph.o + +libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ + mon_client.o \ + osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ + debugfs.o \ + auth.o auth_none.o \ + crypto.o armor.o \ + auth_x.o \ + ceph_fs.o ceph_strings.o ceph_hash.o \ + pagevec.o + +else +#Otherwise we were called directly from the command +# line; invoke the kernel build system. + +KERNELDIR ?= /lib/modules/$(shell uname -r)/build +PWD := $(shell pwd) + +default: all + +all: + $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules + +modules_install: + $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install + +clean: + $(MAKE) -C $(KERNELDIR) M=$(PWD) clean + +endif diff --git a/fs/ceph/armor.c b/net/ceph/armor.c index eb2a666b0be7..eb2a666b0be7 100644 --- a/fs/ceph/armor.c +++ b/net/ceph/armor.c diff --git a/fs/ceph/auth.c b/net/ceph/auth.c index 6d2e30600627..549c1f43e1d5 100644 --- a/fs/ceph/auth.c +++ b/net/ceph/auth.c @@ -1,16 +1,16 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/module.h> #include <linux/err.h> #include <linux/slab.h> -#include "types.h" +#include <linux/ceph/types.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/libceph.h> +#include <linux/ceph/messenger.h> #include "auth_none.h" #include "auth_x.h" -#include "decode.h" -#include "super.h" -#include "messenger.h" /* * get protocol handler diff --git a/fs/ceph/auth_none.c b/net/ceph/auth_none.c index ad1dc21286c7..214c2bb43d62 100644 --- a/fs/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -1,14 +1,15 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/err.h> #include <linux/module.h> #include <linux/random.h> #include <linux/slab.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/auth.h> + #include "auth_none.h" -#include "auth.h" -#include "decode.h" static void reset(struct ceph_auth_client *ac) { diff --git a/fs/ceph/auth_none.h b/net/ceph/auth_none.h index 8164df1a08be..ed7d088b1bc9 100644 --- a/fs/ceph/auth_none.h +++ b/net/ceph/auth_none.h @@ -2,8 +2,7 @@ #define _FS_CEPH_AUTH_NONE_H #include <linux/slab.h> - -#include "auth.h" +#include <linux/ceph/auth.h> /* * null security mode. diff --git a/fs/ceph/auth_x.c b/net/ceph/auth_x.c index a2d002cbdec2..7fd5dfcf6e18 100644 --- a/fs/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -1,16 +1,17 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/err.h> #include <linux/module.h> #include <linux/random.h> #include <linux/slab.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/auth.h> + +#include "crypto.h" #include "auth_x.h" #include "auth_x_protocol.h" -#include "crypto.h" -#include "auth.h" -#include "decode.h" #define TEMP_TICKET_BUF_LEN 256 diff --git a/fs/ceph/auth_x.h b/net/ceph/auth_x.h index ff6f8180e681..e02da7a5c5a1 100644 --- a/fs/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -3,8 +3,9 @@ #include <linux/rbtree.h> +#include <linux/ceph/auth.h> + #include "crypto.h" -#include "auth.h" #include "auth_x_protocol.h" /* diff --git a/fs/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h index 671d30576c4f..671d30576c4f 100644 --- a/fs/ceph/auth_x_protocol.h +++ b/net/ceph/auth_x_protocol.h diff --git a/fs/ceph/buffer.c b/net/ceph/buffer.c index cd39f17021de..53d8abfa25d5 100644 --- a/fs/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -1,10 +1,11 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> +#include <linux/module.h> #include <linux/slab.h> -#include "buffer.h" -#include "decode.h" +#include <linux/ceph/buffer.h> +#include <linux/ceph/decode.h> struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) { @@ -32,6 +33,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) dout("buffer_new %p\n", b); return b; } +EXPORT_SYMBOL(ceph_buffer_new); void ceph_buffer_release(struct kref *kref) { @@ -46,6 +48,7 @@ void ceph_buffer_release(struct kref *kref) } kfree(b); } +EXPORT_SYMBOL(ceph_buffer_release); int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end) { diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c new file mode 100644 index 000000000000..f3e4a13fea0c --- /dev/null +++ b/net/ceph/ceph_common.c @@ -0,0 +1,529 @@ + +#include <linux/ceph/ceph_debug.h> +#include <linux/backing-dev.h> +#include <linux/ctype.h> +#include <linux/fs.h> +#include <linux/inet.h> +#include <linux/in6.h> +#include <linux/module.h> +#include <linux/mount.h> +#include <linux/parser.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/statfs.h> +#include <linux/string.h> + + +#include <linux/ceph/libceph.h> +#include <linux/ceph/debugfs.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/auth.h> + + + +/* + * find filename portion of a path (/foo/bar/baz -> baz) + */ +const char *ceph_file_part(const char *s, int len) +{ + const char *e = s + len; + + while (e != s && *(e-1) != '/') + e--; + return e; +} +EXPORT_SYMBOL(ceph_file_part); + +const char *ceph_msg_type_name(int type) +{ + switch (type) { + case CEPH_MSG_SHUTDOWN: return "shutdown"; + case CEPH_MSG_PING: return "ping"; + case CEPH_MSG_AUTH: return "auth"; + case CEPH_MSG_AUTH_REPLY: return "auth_reply"; + case CEPH_MSG_MON_MAP: return "mon_map"; + case CEPH_MSG_MON_GET_MAP: return "mon_get_map"; + case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe"; + case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; + case CEPH_MSG_STATFS: return "statfs"; + case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; + case CEPH_MSG_MDS_MAP: return "mds_map"; + case CEPH_MSG_CLIENT_SESSION: return "client_session"; + case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; + case CEPH_MSG_CLIENT_REQUEST: return "client_request"; + case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward"; + case CEPH_MSG_CLIENT_REPLY: return "client_reply"; + case CEPH_MSG_CLIENT_CAPS: return "client_caps"; + case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; + case CEPH_MSG_CLIENT_SNAP: return "client_snap"; + case CEPH_MSG_CLIENT_LEASE: return "client_lease"; + case CEPH_MSG_OSD_MAP: return "osd_map"; + case CEPH_MSG_OSD_OP: return "osd_op"; + case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; + default: return "unknown"; + } +} +EXPORT_SYMBOL(ceph_msg_type_name); + +/* + * Initially learn our fsid, or verify an fsid matches. + */ +int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) +{ + if (client->have_fsid) { + if (ceph_fsid_compare(&client->fsid, fsid)) { + pr_err("bad fsid, had %pU got %pU", + &client->fsid, fsid); + return -1; + } + } else { + pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid); + memcpy(&client->fsid, fsid, sizeof(*fsid)); + ceph_debugfs_client_init(client); + client->have_fsid = true; + } + return 0; +} +EXPORT_SYMBOL(ceph_check_fsid); + +static int strcmp_null(const char *s1, const char *s2) +{ + if (!s1 && !s2) + return 0; + if (s1 && !s2) + return -1; + if (!s1 && s2) + return 1; + return strcmp(s1, s2); +} + +int ceph_compare_options(struct ceph_options *new_opt, + struct ceph_client *client) +{ + struct ceph_options *opt1 = new_opt; + struct ceph_options *opt2 = client->options; + int ofs = offsetof(struct ceph_options, mon_addr); + int i; + int ret; + + ret = memcmp(opt1, opt2, ofs); + if (ret) + return ret; + + ret = strcmp_null(opt1->name, opt2->name); + if (ret) + return ret; + + ret = strcmp_null(opt1->secret, opt2->secret); + if (ret) + return ret; + + /* any matching mon ip implies a match */ + for (i = 0; i < opt1->num_mon; i++) { + if (ceph_monmap_contains(client->monc.monmap, + &opt1->mon_addr[i])) + return 0; + } + return -1; +} +EXPORT_SYMBOL(ceph_compare_options); + + +static int parse_fsid(const char *str, struct ceph_fsid *fsid) +{ + int i = 0; + char tmp[3]; + int err = -EINVAL; + int d; + + dout("parse_fsid '%s'\n", str); + tmp[2] = 0; + while (*str && i < 16) { + if (ispunct(*str)) { + str++; + continue; + } + if (!isxdigit(str[0]) || !isxdigit(str[1])) + break; + tmp[0] = str[0]; + tmp[1] = str[1]; + if (sscanf(tmp, "%x", &d) < 1) + break; + fsid->fsid[i] = d & 0xff; + i++; + str += 2; + } + + if (i == 16) + err = 0; + dout("parse_fsid ret %d got fsid %pU", err, fsid); + return err; +} + +/* + * ceph options + */ +enum { + Opt_osdtimeout, + Opt_osdkeepalivetimeout, + Opt_mount_timeout, + Opt_osd_idle_ttl, + Opt_last_int, + /* int args above */ + Opt_fsid, + Opt_name, + Opt_secret, + Opt_ip, + Opt_last_string, + /* string args above */ + Opt_noshare, + Opt_nocrc, +}; + +static match_table_t opt_tokens = { + {Opt_osdtimeout, "osdtimeout=%d"}, + {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, + {Opt_mount_timeout, "mount_timeout=%d"}, + {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, + /* int args above */ + {Opt_fsid, "fsid=%s"}, + {Opt_name, "name=%s"}, + {Opt_secret, "secret=%s"}, + {Opt_ip, "ip=%s"}, + /* string args above */ + {Opt_noshare, "noshare"}, + {Opt_nocrc, "nocrc"}, + {-1, NULL} +}; + +void ceph_destroy_options(struct ceph_options *opt) +{ + dout("destroy_options %p\n", opt); + kfree(opt->name); + kfree(opt->secret); + kfree(opt); +} +EXPORT_SYMBOL(ceph_destroy_options); + +int ceph_parse_options(struct ceph_options **popt, char *options, + const char *dev_name, const char *dev_name_end, + int (*parse_extra_token)(char *c, void *private), + void *private) +{ + struct ceph_options *opt; + const char *c; + int err = -ENOMEM; + substring_t argstr[MAX_OPT_ARGS]; + + opt = kzalloc(sizeof(*opt), GFP_KERNEL); + if (!opt) + return err; + opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), + GFP_KERNEL); + if (!opt->mon_addr) + goto out; + + dout("parse_options %p options '%s' dev_name '%s'\n", opt, options, + dev_name); + + /* start with defaults */ + opt->flags = CEPH_OPT_DEFAULT; + opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; + opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; + opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ + opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ + + /* get mon ip(s) */ + /* ip1[:port1][,ip2[:port2]...] */ + err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr, + CEPH_MAX_MON, &opt->num_mon); + if (err < 0) + goto out; + + /* parse mount options */ + while ((c = strsep(&options, ",")) != NULL) { + int token, intval, ret; + if (!*c) + continue; + err = -EINVAL; + token = match_token((char *)c, opt_tokens, argstr); + if (token < 0 && parse_extra_token) { + /* extra? */ + err = parse_extra_token((char *)c, private); + if (err < 0) { + pr_err("bad option at '%s'\n", c); + goto out; + } + continue; + } + if (token < Opt_last_int) { + ret = match_int(&argstr[0], &intval); + if (ret < 0) { + pr_err("bad mount option arg (not int) " + "at '%s'\n", c); + continue; + } + dout("got int token %d val %d\n", token, intval); + } else if (token > Opt_last_int && token < Opt_last_string) { + dout("got string token %d val %s\n", token, + argstr[0].from); + } else { + dout("got token %d\n", token); + } + switch (token) { + case Opt_ip: + err = ceph_parse_ips(argstr[0].from, + argstr[0].to, + &opt->my_addr, + 1, NULL); + if (err < 0) + goto out; + opt->flags |= CEPH_OPT_MYIP; + break; + + case Opt_fsid: + err = parse_fsid(argstr[0].from, &opt->fsid); + if (err == 0) + opt->flags |= CEPH_OPT_FSID; + break; + case Opt_name: + opt->name = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + break; + case Opt_secret: + opt->secret = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + break; + + /* misc */ + case Opt_osdtimeout: + opt->osd_timeout = intval; + break; + case Opt_osdkeepalivetimeout: + opt->osd_keepalive_timeout = intval; + break; + case Opt_osd_idle_ttl: + opt->osd_idle_ttl = intval; + break; + case Opt_mount_timeout: + opt->mount_timeout = intval; + break; + + case Opt_noshare: + opt->flags |= CEPH_OPT_NOSHARE; + break; + + case Opt_nocrc: + opt->flags |= CEPH_OPT_NOCRC; + break; + + default: + BUG_ON(token); + } + } + + /* success */ + *popt = opt; + return 0; + +out: + ceph_destroy_options(opt); + return err; +} +EXPORT_SYMBOL(ceph_parse_options); + +u64 ceph_client_id(struct ceph_client *client) +{ + return client->monc.auth->global_id; +} +EXPORT_SYMBOL(ceph_client_id); + +/* + * create a fresh client instance + */ +struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) +{ + struct ceph_client *client; + int err = -ENOMEM; + + client = kzalloc(sizeof(*client), GFP_KERNEL); + if (client == NULL) + return ERR_PTR(-ENOMEM); + + client->private = private; + client->options = opt; + + mutex_init(&client->mount_mutex); + init_waitqueue_head(&client->auth_wq); + client->auth_err = 0; + + client->extra_mon_dispatch = NULL; + client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT; + client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT; + + client->msgr = NULL; + + /* subsystems */ + err = ceph_monc_init(&client->monc, client); + if (err < 0) + goto fail; + err = ceph_osdc_init(&client->osdc, client); + if (err < 0) + goto fail_monc; + + return client; + +fail_monc: + ceph_monc_stop(&client->monc); +fail: + kfree(client); + return ERR_PTR(err); +} +EXPORT_SYMBOL(ceph_create_client); + +void ceph_destroy_client(struct ceph_client *client) +{ + dout("destroy_client %p\n", client); + + /* unmount */ + ceph_osdc_stop(&client->osdc); + + /* + * make sure mds and osd connections close out before destroying + * the auth module, which is needed to free those connections' + * ceph_authorizers. + */ + ceph_msgr_flush(); + + ceph_monc_stop(&client->monc); + + ceph_debugfs_client_cleanup(client); + + if (client->msgr) + ceph_messenger_destroy(client->msgr); + + ceph_destroy_options(client->options); + + kfree(client); + dout("destroy_client %p done\n", client); +} +EXPORT_SYMBOL(ceph_destroy_client); + +/* + * true if we have the mon map (and have thus joined the cluster) + */ +static int have_mon_and_osd_map(struct ceph_client *client) +{ + return client->monc.monmap && client->monc.monmap->epoch && + client->osdc.osdmap && client->osdc.osdmap->epoch; +} + +/* + * mount: join the ceph cluster, and open root directory. + */ +int __ceph_open_session(struct ceph_client *client, unsigned long started) +{ + struct ceph_entity_addr *myaddr = NULL; + int err; + unsigned long timeout = client->options->mount_timeout * HZ; + + /* initialize the messenger */ + if (client->msgr == NULL) { + if (ceph_test_opt(client, MYIP)) + myaddr = &client->options->my_addr; + client->msgr = ceph_messenger_create(myaddr, + client->supported_features, + client->required_features); + if (IS_ERR(client->msgr)) { + client->msgr = NULL; + return PTR_ERR(client->msgr); + } + client->msgr->nocrc = ceph_test_opt(client, NOCRC); + } + + /* open session, and wait for mon and osd maps */ + err = ceph_monc_open_session(&client->monc); + if (err < 0) + return err; + + while (!have_mon_and_osd_map(client)) { + err = -EIO; + if (timeout && time_after_eq(jiffies, started + timeout)) + return err; + + /* wait */ + dout("mount waiting for mon_map\n"); + err = wait_event_interruptible_timeout(client->auth_wq, + have_mon_and_osd_map(client) || (client->auth_err < 0), + timeout); + if (err == -EINTR || err == -ERESTARTSYS) + return err; + if (client->auth_err < 0) + return client->auth_err; + } + + return 0; +} +EXPORT_SYMBOL(__ceph_open_session); + + +int ceph_open_session(struct ceph_client *client) +{ + int ret; + unsigned long started = jiffies; /* note the start time */ + + dout("open_session start\n"); + mutex_lock(&client->mount_mutex); + + ret = __ceph_open_session(client, started); + + mutex_unlock(&client->mount_mutex); + return ret; +} +EXPORT_SYMBOL(ceph_open_session); + + +static int __init init_ceph_lib(void) +{ + int ret = 0; + + ret = ceph_debugfs_init(); + if (ret < 0) + goto out; + + ret = ceph_msgr_init(); + if (ret < 0) + goto out_debugfs; + + pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n", + CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL, + CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, + CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); + + return 0; + +out_debugfs: + ceph_debugfs_cleanup(); +out: + return ret; +} + +static void __exit exit_ceph_lib(void) +{ + dout("exit_ceph_lib\n"); + ceph_msgr_exit(); + ceph_debugfs_cleanup(); +} + +module_init(init_ceph_lib); +module_exit(exit_ceph_lib); + +MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); +MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); +MODULE_AUTHOR("Patience Warnick <patience@newdream.net>"); +MODULE_DESCRIPTION("Ceph filesystem for Linux"); +MODULE_LICENSE("GPL"); diff --git a/fs/ceph/ceph_fs.c b/net/ceph/ceph_fs.c index 3ac6cc7c1156..a3a3a31d3c37 100644 --- a/fs/ceph/ceph_fs.c +++ b/net/ceph/ceph_fs.c @@ -1,7 +1,8 @@ /* * Some non-inline ceph helpers */ -#include "types.h" +#include <linux/module.h> +#include <linux/ceph/types.h> /* * return true if @layout appears to be valid @@ -52,6 +53,7 @@ int ceph_flags_to_mode(int flags) return mode; } +EXPORT_SYMBOL(ceph_flags_to_mode); int ceph_caps_for_mode(int mode) { @@ -70,3 +72,4 @@ int ceph_caps_for_mode(int mode) return caps; } +EXPORT_SYMBOL(ceph_caps_for_mode); diff --git a/fs/ceph/ceph_hash.c b/net/ceph/ceph_hash.c index bd570015d147..815ef8826796 100644 --- a/fs/ceph/ceph_hash.c +++ b/net/ceph/ceph_hash.c @@ -1,5 +1,5 @@ -#include "types.h" +#include <linux/ceph/types.h> /* * Robert Jenkin's hash function. diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c new file mode 100644 index 000000000000..3fbda04de29c --- /dev/null +++ b/net/ceph/ceph_strings.c @@ -0,0 +1,84 @@ +/* + * Ceph string constants + */ +#include <linux/module.h> +#include <linux/ceph/types.h> + +const char *ceph_entity_type_name(int type) +{ + switch (type) { + case CEPH_ENTITY_TYPE_MDS: return "mds"; + case CEPH_ENTITY_TYPE_OSD: return "osd"; + case CEPH_ENTITY_TYPE_MON: return "mon"; + case CEPH_ENTITY_TYPE_CLIENT: return "client"; + case CEPH_ENTITY_TYPE_AUTH: return "auth"; + default: return "unknown"; + } +} + +const char *ceph_osd_op_name(int op) +{ + switch (op) { + case CEPH_OSD_OP_READ: return "read"; + case CEPH_OSD_OP_STAT: return "stat"; + + case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; + + case CEPH_OSD_OP_WRITE: return "write"; + case CEPH_OSD_OP_DELETE: return "delete"; + case CEPH_OSD_OP_TRUNCATE: return "truncate"; + case CEPH_OSD_OP_ZERO: return "zero"; + case CEPH_OSD_OP_WRITEFULL: return "writefull"; + case CEPH_OSD_OP_ROLLBACK: return "rollback"; + + case CEPH_OSD_OP_APPEND: return "append"; + case CEPH_OSD_OP_STARTSYNC: return "startsync"; + case CEPH_OSD_OP_SETTRUNC: return "settrunc"; + case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc"; + + case CEPH_OSD_OP_TMAPUP: return "tmapup"; + case CEPH_OSD_OP_TMAPGET: return "tmapget"; + case CEPH_OSD_OP_TMAPPUT: return "tmapput"; + + case CEPH_OSD_OP_GETXATTR: return "getxattr"; + case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; + case CEPH_OSD_OP_SETXATTR: return "setxattr"; + case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; + case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; + case CEPH_OSD_OP_RMXATTR: return "rmxattr"; + case CEPH_OSD_OP_CMPXATTR: return "cmpxattr"; + + case CEPH_OSD_OP_PULL: return "pull"; + case CEPH_OSD_OP_PUSH: return "push"; + case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; + case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; + case CEPH_OSD_OP_SCRUB: return "scrub"; + + case CEPH_OSD_OP_WRLOCK: return "wrlock"; + case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; + case CEPH_OSD_OP_RDLOCK: return "rdlock"; + case CEPH_OSD_OP_RDUNLOCK: return "rdunlock"; + case CEPH_OSD_OP_UPLOCK: return "uplock"; + case CEPH_OSD_OP_DNLOCK: return "dnlock"; + + case CEPH_OSD_OP_CALL: return "call"; + + case CEPH_OSD_OP_PGLS: return "pgls"; + } + return "???"; +} + + +const char *ceph_pool_op_name(int op) +{ + switch (op) { + case POOL_OP_CREATE: return "create"; + case POOL_OP_DELETE: return "delete"; + case POOL_OP_AUID_CHANGE: return "auid change"; + case POOL_OP_CREATE_SNAP: return "create snap"; + case POOL_OP_DELETE_SNAP: return "delete snap"; + case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; + case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap"; + } + return "???"; +} diff --git a/fs/ceph/crush/crush.c b/net/ceph/crush/crush.c index fabd302e5779..d6ebb13a18a4 100644 --- a/fs/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -8,7 +8,7 @@ # define BUG_ON(x) assert(!(x)) #endif -#include "crush.h" +#include <linux/crush/crush.h> const char *crush_bucket_alg_name(int alg) { diff --git a/fs/ceph/crush/hash.c b/net/ceph/crush/hash.c index 5873aed694bf..5bb63e37a8a1 100644 --- a/fs/ceph/crush/hash.c +++ b/net/ceph/crush/hash.c @@ -1,6 +1,6 @@ #include <linux/types.h> -#include "hash.h" +#include <linux/crush/hash.h> /* * Robert Jenkins' function for mixing 32-bit values diff --git a/fs/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index a4eec133258e..42599e31dcad 100644 --- a/fs/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -18,8 +18,8 @@ # define kfree(x) free(x) #endif -#include "crush.h" -#include "hash.h" +#include <linux/crush/crush.h> +#include <linux/crush/hash.h> /* * Implement the core CRUSH mapping algorithm. diff --git a/fs/ceph/crypto.c b/net/ceph/crypto.c index a3e627f63293..7b505b0c983f 100644 --- a/fs/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -1,13 +1,13 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/err.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <crypto/hash.h> +#include <linux/ceph/decode.h> #include "crypto.h" -#include "decode.h" int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end) { diff --git a/fs/ceph/crypto.h b/net/ceph/crypto.h index bdf38607323c..f9eccace592b 100644 --- a/fs/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -1,8 +1,8 @@ #ifndef _FS_CEPH_CRYPTO_H #define _FS_CEPH_CRYPTO_H -#include "types.h" -#include "buffer.h" +#include <linux/ceph/types.h> +#include <linux/ceph/buffer.h> /* * cryptographic secret diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c new file mode 100644 index 000000000000..27d4ea315d12 --- /dev/null +++ b/net/ceph/debugfs.c @@ -0,0 +1,267 @@ +#include <linux/ceph/ceph_debug.h> + +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include <linux/ceph/libceph.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/debugfs.h> + +#ifdef CONFIG_DEBUG_FS + +/* + * Implement /sys/kernel/debug/ceph fun + * + * /sys/kernel/debug/ceph/client* - an instance of the ceph client + * .../osdmap - current osdmap + * .../monmap - current monmap + * .../osdc - active osd requests + * .../monc - mon client state + * .../dentry_lru - dump contents of dentry lru + * .../caps - expose cap (reservation) stats + * .../bdi - symlink to ../../bdi/something + */ + +static struct dentry *ceph_debugfs_dir; + +static int monmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_client *client = s->private; + + if (client->monc.monmap == NULL) + return 0; + + seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); + for (i = 0; i < client->monc.monmap->num_mon; i++) { + struct ceph_entity_inst *inst = + &client->monc.monmap->mon_inst[i]; + + seq_printf(s, "\t%s%lld\t%s\n", + ENTITY_NAME(inst->name), + ceph_pr_addr(&inst->addr.in_addr)); + } + return 0; +} + +static int osdmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_client *client = s->private; + struct rb_node *n; + + if (client->osdc.osdmap == NULL) + return 0; + seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); + seq_printf(s, "flags%s%s\n", + (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? + " NEARFULL" : "", + (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? + " FULL" : ""); + for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { + struct ceph_pg_pool_info *pool = + rb_entry(n, struct ceph_pg_pool_info, node); + seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", + pool->id, pool->v.pg_num, pool->pg_num_mask, + pool->v.lpg_num, pool->lpg_num_mask); + } + for (i = 0; i < client->osdc.osdmap->max_osd; i++) { + struct ceph_entity_addr *addr = + &client->osdc.osdmap->osd_addr[i]; + int state = client->osdc.osdmap->osd_state[i]; + char sb[64]; + + seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", + i, ceph_pr_addr(&addr->in_addr), + ((client->osdc.osdmap->osd_weight[i]*100) >> 16), + ceph_osdmap_state_str(sb, sizeof(sb), state)); + } + return 0; +} + +static int monc_show(struct seq_file *s, void *p) +{ + struct ceph_client *client = s->private; + struct ceph_mon_generic_request *req; + struct ceph_mon_client *monc = &client->monc; + struct rb_node *rp; + + mutex_lock(&monc->mutex); + + if (monc->have_mdsmap) + seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap); + if (monc->have_osdmap) + seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap); + if (monc->want_next_osdmap) + seq_printf(s, "want next osdmap\n"); + + for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { + __u16 op; + req = rb_entry(rp, struct ceph_mon_generic_request, node); + op = le16_to_cpu(req->request->hdr.type); + if (op == CEPH_MSG_STATFS) + seq_printf(s, "%lld statfs\n", req->tid); + else + seq_printf(s, "%lld unknown\n", req->tid); + } + + mutex_unlock(&monc->mutex); + return 0; +} + +static int osdc_show(struct seq_file *s, void *pp) +{ + struct ceph_client *client = s->private; + struct ceph_osd_client *osdc = &client->osdc; + struct rb_node *p; + + mutex_lock(&osdc->request_mutex); + for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { + struct ceph_osd_request *req; + struct ceph_osd_request_head *head; + struct ceph_osd_op *op; + int num_ops; + int opcode, olen; + int i; + + req = rb_entry(p, struct ceph_osd_request, r_node); + + seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1, + le32_to_cpu(req->r_pgid.pool), + le16_to_cpu(req->r_pgid.ps)); + + head = req->r_request->front.iov_base; + op = (void *)(head + 1); + + num_ops = le16_to_cpu(head->num_ops); + olen = le32_to_cpu(head->object_len); + seq_printf(s, "%.*s", olen, + (const char *)(head->ops + num_ops)); + + if (req->r_reassert_version.epoch) + seq_printf(s, "\t%u'%llu", + (unsigned)le32_to_cpu(req->r_reassert_version.epoch), + le64_to_cpu(req->r_reassert_version.version)); + else + seq_printf(s, "\t"); + + for (i = 0; i < num_ops; i++) { + opcode = le16_to_cpu(op->op); + seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); + op++; + } + + seq_printf(s, "\n"); + } + mutex_unlock(&osdc->request_mutex); + return 0; +} + +CEPH_DEFINE_SHOW_FUNC(monmap_show) +CEPH_DEFINE_SHOW_FUNC(osdmap_show) +CEPH_DEFINE_SHOW_FUNC(monc_show) +CEPH_DEFINE_SHOW_FUNC(osdc_show) + +int ceph_debugfs_init(void) +{ + ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); + if (!ceph_debugfs_dir) + return -ENOMEM; + return 0; +} + +void ceph_debugfs_cleanup(void) +{ + debugfs_remove(ceph_debugfs_dir); +} + +int ceph_debugfs_client_init(struct ceph_client *client) +{ + int ret = -ENOMEM; + char name[80]; + + snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, + client->monc.auth->global_id); + + client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); + if (!client->debugfs_dir) + goto out; + + client->monc.debugfs_file = debugfs_create_file("monc", + 0600, + client->debugfs_dir, + client, + &monc_show_fops); + if (!client->monc.debugfs_file) + goto out; + + client->osdc.debugfs_file = debugfs_create_file("osdc", + 0600, + client->debugfs_dir, + client, + &osdc_show_fops); + if (!client->osdc.debugfs_file) + goto out; + + client->debugfs_monmap = debugfs_create_file("monmap", + 0600, + client->debugfs_dir, + client, + &monmap_show_fops); + if (!client->debugfs_monmap) + goto out; + + client->debugfs_osdmap = debugfs_create_file("osdmap", + 0600, + client->debugfs_dir, + client, + &osdmap_show_fops); + if (!client->debugfs_osdmap) + goto out; + + return 0; + +out: + ceph_debugfs_client_cleanup(client); + return ret; +} + +void ceph_debugfs_client_cleanup(struct ceph_client *client) +{ + debugfs_remove(client->debugfs_osdmap); + debugfs_remove(client->debugfs_monmap); + debugfs_remove(client->osdc.debugfs_file); + debugfs_remove(client->monc.debugfs_file); + debugfs_remove(client->debugfs_dir); +} + +#else /* CONFIG_DEBUG_FS */ + +int ceph_debugfs_init(void) +{ + return 0; +} + +void ceph_debugfs_cleanup(void) +{ +} + +int ceph_debugfs_client_init(struct ceph_client *client) +{ + return 0; +} + +void ceph_debugfs_client_cleanup(struct ceph_client *client) +{ +} + +#endif /* CONFIG_DEBUG_FS */ + +EXPORT_SYMBOL(ceph_debugfs_init); +EXPORT_SYMBOL(ceph_debugfs_cleanup); diff --git a/fs/ceph/messenger.c b/net/ceph/messenger.c index 2502d76fcec1..0e8157ee5d43 100644 --- a/fs/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/crc32c.h> #include <linux/ctype.h> @@ -9,12 +9,14 @@ #include <linux/slab.h> #include <linux/socket.h> #include <linux/string.h> +#include <linux/bio.h> +#include <linux/blkdev.h> #include <net/tcp.h> -#include "super.h" -#include "messenger.h" -#include "decode.h" -#include "pagelist.h" +#include <linux/ceph/libceph.h> +#include <linux/ceph/messenger.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/pagelist.h> /* * Ceph uses the messenger to exchange ceph_msg messages with other @@ -48,7 +50,7 @@ static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; static DEFINE_SPINLOCK(addr_str_lock); static int last_addr_str; -const char *pr_addr(const struct sockaddr_storage *ss) +const char *ceph_pr_addr(const struct sockaddr_storage *ss) { int i; char *s; @@ -79,6 +81,7 @@ const char *pr_addr(const struct sockaddr_storage *ss) return s; } +EXPORT_SYMBOL(ceph_pr_addr); static void encode_my_addr(struct ceph_messenger *msgr) { @@ -91,7 +94,7 @@ static void encode_my_addr(struct ceph_messenger *msgr) */ struct workqueue_struct *ceph_msgr_wq; -int __init ceph_msgr_init(void) +int ceph_msgr_init(void) { ceph_msgr_wq = create_workqueue("ceph-msgr"); if (IS_ERR(ceph_msgr_wq)) { @@ -102,16 +105,19 @@ int __init ceph_msgr_init(void) } return 0; } +EXPORT_SYMBOL(ceph_msgr_init); void ceph_msgr_exit(void) { destroy_workqueue(ceph_msgr_wq); } +EXPORT_SYMBOL(ceph_msgr_exit); void ceph_msgr_flush(void) { flush_workqueue(ceph_msgr_wq); } +EXPORT_SYMBOL(ceph_msgr_flush); /* @@ -221,19 +227,19 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) set_sock_callbacks(sock, con); - dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); + dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), sock->sk->sk_state); ret = 0; } if (ret < 0) { pr_err("connect %s error %d\n", - pr_addr(&con->peer_addr.in_addr), ret); + ceph_pr_addr(&con->peer_addr.in_addr), ret); sock_release(sock); con->sock = NULL; con->error_msg = "connect error"; @@ -334,7 +340,8 @@ static void reset_connection(struct ceph_connection *con) */ void ceph_con_close(struct ceph_connection *con) { - dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr)); + dout("con_close %p peer %s\n", con, + ceph_pr_addr(&con->peer_addr.in_addr)); set_bit(CLOSED, &con->state); /* in case there's queued work */ clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ clear_bit(LOSSYTX, &con->state); /* so we retry next connect */ @@ -347,19 +354,21 @@ void ceph_con_close(struct ceph_connection *con) mutex_unlock(&con->mutex); queue_con(con); } +EXPORT_SYMBOL(ceph_con_close); /* * Reopen a closed connection, with a new peer address. */ void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) { - dout("con_open %p %s\n", con, pr_addr(&addr->in_addr)); + dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); set_bit(OPENING, &con->state); clear_bit(CLOSED, &con->state); memcpy(&con->peer_addr, addr, sizeof(*addr)); con->delay = 0; /* reset backoff memory */ queue_con(con); } +EXPORT_SYMBOL(ceph_con_open); /* * return true if this connection ever successfully opened @@ -406,6 +415,7 @@ void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con) INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, con_work); } +EXPORT_SYMBOL(ceph_con_init); /* @@ -529,8 +539,11 @@ static void prepare_write_message(struct ceph_connection *con) if (le32_to_cpu(m->hdr.data_len) > 0) { /* initialize page iterator */ con->out_msg_pos.page = 0; - con->out_msg_pos.page_pos = - le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK; + if (m->pages) + con->out_msg_pos.page_pos = + le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK; + else + con->out_msg_pos.page_pos = 0; con->out_msg_pos.data_pos = 0; con->out_msg_pos.did_page_crc = 0; con->out_more = 1; /* data + footer will follow */ @@ -647,7 +660,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); - con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED); + con->out_connect.features = cpu_to_le64(msgr->supported_features); con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ -712,6 +725,31 @@ out: return ret; /* done! */ } +#ifdef CONFIG_BLOCK +static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) +{ + if (!bio) { + *iter = NULL; + *seg = 0; + return; + } + *iter = bio; + *seg = bio->bi_idx; +} + +static void iter_bio_next(struct bio **bio_iter, int *seg) +{ + if (*bio_iter == NULL) + return; + + BUG_ON(*seg >= (*bio_iter)->bi_vcnt); + + (*seg)++; + if (*seg == (*bio_iter)->bi_vcnt) + init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); +} +#endif + /* * Write as much message data payload as we can. If we finish, queue * up the footer. @@ -726,21 +764,46 @@ static int write_partial_msg_pages(struct ceph_connection *con) size_t len; int crc = con->msgr->nocrc; int ret; + int total_max_write; + int in_trail = 0; + size_t trail_len = (msg->trail ? msg->trail->length : 0); dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, con->out_msg_pos.page_pos); - while (con->out_msg_pos.page < con->out_msg->nr_pages) { +#ifdef CONFIG_BLOCK + if (msg->bio && !msg->bio_iter) + init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); +#endif + + while (data_len > con->out_msg_pos.data_pos) { struct page *page = NULL; void *kaddr = NULL; + int max_write = PAGE_SIZE; + int page_shift = 0; + + total_max_write = data_len - trail_len - + con->out_msg_pos.data_pos; /* * if we are calculating the data crc (the default), we need * to map the page. if our pages[] has been revoked, use the * zero page. */ - if (msg->pages) { + + /* have we reached the trail part of the data? */ + if (con->out_msg_pos.data_pos >= data_len - trail_len) { + in_trail = 1; + + total_max_write = data_len - con->out_msg_pos.data_pos; + + page = list_first_entry(&msg->trail->head, + struct page, lru); + if (crc) + kaddr = kmap(page); + max_write = PAGE_SIZE; + } else if (msg->pages) { page = msg->pages[con->out_msg_pos.page]; if (crc) kaddr = kmap(page); @@ -749,13 +812,25 @@ static int write_partial_msg_pages(struct ceph_connection *con) struct page, lru); if (crc) kaddr = kmap(page); +#ifdef CONFIG_BLOCK + } else if (msg->bio) { + struct bio_vec *bv; + + bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); + page = bv->bv_page; + page_shift = bv->bv_offset; + if (crc) + kaddr = kmap(page) + page_shift; + max_write = bv->bv_len; +#endif } else { page = con->msgr->zero_page; if (crc) kaddr = page_address(con->msgr->zero_page); } - len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos), - (int)(data_len - con->out_msg_pos.data_pos)); + len = min_t(int, max_write - con->out_msg_pos.page_pos, + total_max_write); + if (crc && !con->out_msg_pos.did_page_crc) { void *base = kaddr + con->out_msg_pos.page_pos; u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); @@ -765,13 +840,14 @@ static int write_partial_msg_pages(struct ceph_connection *con) cpu_to_le32(crc32c(tmpcrc, base, len)); con->out_msg_pos.did_page_crc = 1; } - ret = kernel_sendpage(con->sock, page, - con->out_msg_pos.page_pos, len, + con->out_msg_pos.page_pos + page_shift, + len, MSG_DONTWAIT | MSG_NOSIGNAL | MSG_MORE); - if (crc && (msg->pages || msg->pagelist)) + if (crc && + (msg->pages || msg->pagelist || msg->bio || in_trail)) kunmap(page); if (ret <= 0) @@ -783,9 +859,16 @@ static int write_partial_msg_pages(struct ceph_connection *con) con->out_msg_pos.page_pos = 0; con->out_msg_pos.page++; con->out_msg_pos.did_page_crc = 0; - if (msg->pagelist) + if (in_trail) + list_move_tail(&page->lru, + &msg->trail->head); + else if (msg->pagelist) list_move_tail(&page->lru, &msg->pagelist->head); +#ifdef CONFIG_BLOCK + else if (msg->bio) + iter_bio_next(&msg->bio_iter, &msg->bio_seg); +#endif } } @@ -938,7 +1021,7 @@ static int verify_hello(struct ceph_connection *con) { if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { pr_err("connect to %s got bad banner\n", - pr_addr(&con->peer_addr.in_addr)); + ceph_pr_addr(&con->peer_addr.in_addr)); con->error_msg = "protocol error, bad banner"; return -1; } @@ -1041,7 +1124,7 @@ int ceph_parse_ips(const char *c, const char *end, addr_set_port(ss, port); - dout("parse_ips got %s\n", pr_addr(ss)); + dout("parse_ips got %s\n", ceph_pr_addr(ss)); if (p == end) break; @@ -1061,6 +1144,7 @@ bad: pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); return -EINVAL; } +EXPORT_SYMBOL(ceph_parse_ips); static int process_banner(struct ceph_connection *con) { @@ -1082,9 +1166,9 @@ static int process_banner(struct ceph_connection *con) !(addr_is_blank(&con->actual_peer_addr.in_addr) && con->actual_peer_addr.nonce == con->peer_addr.nonce)) { pr_warning("wrong peer, want %s/%d, got %s/%d\n", - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), (int)le32_to_cpu(con->peer_addr.nonce), - pr_addr(&con->actual_peer_addr.in_addr), + ceph_pr_addr(&con->actual_peer_addr.in_addr), (int)le32_to_cpu(con->actual_peer_addr.nonce)); con->error_msg = "wrong peer at address"; return -1; @@ -1102,7 +1186,7 @@ static int process_banner(struct ceph_connection *con) addr_set_port(&con->msgr->inst.addr.in_addr, port); encode_my_addr(con->msgr); dout("process_banner learned my addr is %s\n", - pr_addr(&con->msgr->inst.addr.in_addr)); + ceph_pr_addr(&con->msgr->inst.addr.in_addr)); } set_bit(NEGOTIATING, &con->state); @@ -1123,8 +1207,8 @@ static void fail_protocol(struct ceph_connection *con) static int process_connect(struct ceph_connection *con) { - u64 sup_feat = CEPH_FEATURE_SUPPORTED; - u64 req_feat = CEPH_FEATURE_REQUIRED; + u64 sup_feat = con->msgr->supported_features; + u64 req_feat = con->msgr->required_features; u64 server_feat = le64_to_cpu(con->in_reply.features); dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@ -1134,7 +1218,7 @@ static int process_connect(struct ceph_connection *con) pr_err("%s%lld %s feature set mismatch," " my %llx < server's %llx, missing %llx\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), sup_feat, server_feat, server_feat & ~sup_feat); con->error_msg = "missing required protocol features"; fail_protocol(con); @@ -1144,7 +1228,7 @@ static int process_connect(struct ceph_connection *con) pr_err("%s%lld %s protocol version mismatch," " my %d != server's %d\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), le32_to_cpu(con->out_connect.protocol_version), le32_to_cpu(con->in_reply.protocol_version)); con->error_msg = "protocol version mismatch"; @@ -1178,7 +1262,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_connect.connect_seq)); pr_err("%s%lld %s connection reset\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr)); + ceph_pr_addr(&con->peer_addr.in_addr)); reset_connection(con); prepare_write_connect(con->msgr, con, 0); prepare_read_connect(con); @@ -1223,7 +1307,7 @@ static int process_connect(struct ceph_connection *con) pr_err("%s%lld %s protocol feature mismatch," " my required %llx > server's %llx, need %llx\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), req_feat, server_feat, req_feat & ~server_feat); con->error_msg = "missing required protocol features"; fail_protocol(con); @@ -1305,8 +1389,7 @@ static int read_partial_message_section(struct ceph_connection *con, struct kvec *section, unsigned int sec_len, u32 *crc) { - int left; - int ret; + int ret, left; BUG_ON(!section); @@ -1329,13 +1412,83 @@ static int read_partial_message_section(struct ceph_connection *con, static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip); + + +static int read_partial_message_pages(struct ceph_connection *con, + struct page **pages, + unsigned data_len, int datacrc) +{ + void *p; + int ret; + int left; + + left = min((int)(data_len - con->in_msg_pos.data_pos), + (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); + /* (page) data */ + BUG_ON(pages == NULL); + p = kmap(pages[con->in_msg_pos.page]); + ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, + left); + if (ret > 0 && datacrc) + con->in_data_crc = + crc32c(con->in_data_crc, + p + con->in_msg_pos.page_pos, ret); + kunmap(pages[con->in_msg_pos.page]); + if (ret <= 0) + return ret; + con->in_msg_pos.data_pos += ret; + con->in_msg_pos.page_pos += ret; + if (con->in_msg_pos.page_pos == PAGE_SIZE) { + con->in_msg_pos.page_pos = 0; + con->in_msg_pos.page++; + } + + return ret; +} + +#ifdef CONFIG_BLOCK +static int read_partial_message_bio(struct ceph_connection *con, + struct bio **bio_iter, int *bio_seg, + unsigned data_len, int datacrc) +{ + struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg); + void *p; + int ret, left; + + if (IS_ERR(bv)) + return PTR_ERR(bv); + + left = min((int)(data_len - con->in_msg_pos.data_pos), + (int)(bv->bv_len - con->in_msg_pos.page_pos)); + + p = kmap(bv->bv_page) + bv->bv_offset; + + ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, + left); + if (ret > 0 && datacrc) + con->in_data_crc = + crc32c(con->in_data_crc, + p + con->in_msg_pos.page_pos, ret); + kunmap(bv->bv_page); + if (ret <= 0) + return ret; + con->in_msg_pos.data_pos += ret; + con->in_msg_pos.page_pos += ret; + if (con->in_msg_pos.page_pos == bv->bv_len) { + con->in_msg_pos.page_pos = 0; + iter_bio_next(bio_iter, bio_seg); + } + + return ret; +} +#endif + /* * read (part of) a message. */ static int read_partial_message(struct ceph_connection *con) { struct ceph_msg *m = con->in_msg; - void *p; int ret; int to, left; unsigned front_len, middle_len, data_len, data_off; @@ -1381,7 +1534,7 @@ static int read_partial_message(struct ceph_connection *con) if ((s64)seq - (s64)con->in_seq < 1) { pr_info("skipping %s%lld %s seq %lld, expected %lld\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr.in_addr), seq, con->in_seq + 1); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); @@ -1422,7 +1575,10 @@ static int read_partial_message(struct ceph_connection *con) m->middle->vec.iov_len = 0; con->in_msg_pos.page = 0; - con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; + if (m->pages) + con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; + else + con->in_msg_pos.page_pos = 0; con->in_msg_pos.data_pos = 0; } @@ -1440,27 +1596,29 @@ static int read_partial_message(struct ceph_connection *con) if (ret <= 0) return ret; } +#ifdef CONFIG_BLOCK + if (m->bio && !m->bio_iter) + init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); +#endif /* (page) data */ while (con->in_msg_pos.data_pos < data_len) { - left = min((int)(data_len - con->in_msg_pos.data_pos), - (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); - BUG_ON(m->pages == NULL); - p = kmap(m->pages[con->in_msg_pos.page]); - ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, - left); - if (ret > 0 && datacrc) - con->in_data_crc = - crc32c(con->in_data_crc, - p + con->in_msg_pos.page_pos, ret); - kunmap(m->pages[con->in_msg_pos.page]); - if (ret <= 0) - return ret; - con->in_msg_pos.data_pos += ret; - con->in_msg_pos.page_pos += ret; - if (con->in_msg_pos.page_pos == PAGE_SIZE) { - con->in_msg_pos.page_pos = 0; - con->in_msg_pos.page++; + if (m->pages) { + ret = read_partial_message_pages(con, m->pages, + data_len, datacrc); + if (ret <= 0) + return ret; +#ifdef CONFIG_BLOCK + } else if (m->bio) { + + ret = read_partial_message_bio(con, + &m->bio_iter, &m->bio_seg, + data_len, datacrc); + if (ret <= 0) + return ret; +#endif + } else { + BUG_ON(1); } } @@ -1874,9 +2032,9 @@ out: static void ceph_fault(struct ceph_connection *con) { pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), con->error_msg); + ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); dout("fault %p state %lu to peer %s\n", - con, con->state, pr_addr(&con->peer_addr.in_addr)); + con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); if (test_bit(LOSSYTX, &con->state)) { dout("fault on LOSSYTX channel\n"); @@ -1936,7 +2094,9 @@ out: /* * create a new messenger instance */ -struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) +struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, + u32 supported_features, + u32 required_features) { struct ceph_messenger *msgr; @@ -1944,6 +2104,9 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) if (msgr == NULL) return ERR_PTR(-ENOMEM); + msgr->supported_features = supported_features; + msgr->required_features = required_features; + spin_lock_init(&msgr->global_seq_lock); /* the zero page is needed if a request is "canceled" while the message @@ -1966,6 +2129,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) dout("messenger_create %p\n", msgr); return msgr; } +EXPORT_SYMBOL(ceph_messenger_create); void ceph_messenger_destroy(struct ceph_messenger *msgr) { @@ -1975,6 +2139,7 @@ void ceph_messenger_destroy(struct ceph_messenger *msgr) kfree(msgr); dout("destroyed messenger %p\n", msgr); } +EXPORT_SYMBOL(ceph_messenger_destroy); /* * Queue up an outgoing message on the given connection. @@ -2011,6 +2176,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) queue_con(con); } +EXPORT_SYMBOL(ceph_con_send); /* * Revoke a message that was previously queued for send @@ -2076,6 +2242,7 @@ void ceph_con_keepalive(struct ceph_connection *con) test_and_set_bit(WRITE_PENDING, &con->state) == 0) queue_con(con); } +EXPORT_SYMBOL(ceph_con_keepalive); /* @@ -2136,6 +2303,10 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) m->nr_pages = 0; m->pages = NULL; m->pagelist = NULL; + m->bio = NULL; + m->bio_iter = NULL; + m->bio_seg = 0; + m->trail = NULL; dout("ceph_msg_new %p front %d\n", m, front_len); return m; @@ -2146,6 +2317,7 @@ out: pr_err("msg_new can't create type %d front %d\n", type, front_len); return NULL; } +EXPORT_SYMBOL(ceph_msg_new); /* * Allocate "middle" portion of a message, if it is needed and wasn't @@ -2250,11 +2422,14 @@ void ceph_msg_last_put(struct kref *kref) m->pagelist = NULL; } + m->trail = NULL; + if (m->pool) ceph_msgpool_put(m->pool, m); else ceph_msg_kfree(m); } +EXPORT_SYMBOL(ceph_msg_last_put); void ceph_msg_dump(struct ceph_msg *msg) { @@ -2275,3 +2450,4 @@ void ceph_msg_dump(struct ceph_msg *msg) DUMP_PREFIX_OFFSET, 16, 1, &msg->footer, sizeof(msg->footer), true); } +EXPORT_SYMBOL(ceph_msg_dump); diff --git a/fs/ceph/mon_client.c b/net/ceph/mon_client.c index b2a5a3e4a671..8a079399174a 100644 --- a/fs/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1,14 +1,16 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> +#include <linux/module.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/random.h> #include <linux/sched.h> -#include "mon_client.h" -#include "super.h" -#include "auth.h" -#include "decode.h" +#include <linux/ceph/mon_client.h> +#include <linux/ceph/libceph.h> +#include <linux/ceph/decode.h> + +#include <linux/ceph/auth.h> /* * Interact with Ceph monitor cluster. Handle requests for new map @@ -74,7 +76,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end) m->num_mon); for (i = 0; i < m->num_mon; i++) dout("monmap_decode mon%d is %s\n", i, - pr_addr(&m->mon_inst[i].addr.in_addr)); + ceph_pr_addr(&m->mon_inst[i].addr.in_addr)); return m; bad: @@ -191,30 +193,33 @@ static void __send_subscribe(struct ceph_mon_client *monc) struct ceph_msg *msg = monc->m_subscribe; struct ceph_mon_subscribe_item *i; void *p, *end; + int num; p = msg->front.iov_base; end = p + msg->front_max; - dout("__send_subscribe to 'mdsmap' %u+\n", - (unsigned)monc->have_mdsmap); + num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; + ceph_encode_32(&p, num); + if (monc->want_next_osdmap) { dout("__send_subscribe to 'osdmap' %u\n", (unsigned)monc->have_osdmap); - ceph_encode_32(&p, 3); ceph_encode_string(&p, end, "osdmap", 6); i = p; i->have = cpu_to_le64(monc->have_osdmap); i->onetime = 1; p += sizeof(*i); monc->want_next_osdmap = 2; /* requested */ - } else { - ceph_encode_32(&p, 2); } - ceph_encode_string(&p, end, "mdsmap", 6); - i = p; - i->have = cpu_to_le64(monc->have_mdsmap); - i->onetime = 0; - p += sizeof(*i); + if (monc->want_mdsmap) { + dout("__send_subscribe to 'mdsmap' %u+\n", + (unsigned)monc->have_mdsmap); + ceph_encode_string(&p, end, "mdsmap", 6); + i = p; + i->have = cpu_to_le64(monc->have_mdsmap); + i->onetime = 0; + p += sizeof(*i); + } ceph_encode_string(&p, end, "monmap", 6); i = p; i->have = 0; @@ -243,7 +248,8 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc, mutex_lock(&monc->mutex); if (monc->hunting) { pr_info("mon%d %s session established\n", - monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr)); + monc->cur_mon, + ceph_pr_addr(&monc->con->peer_addr.in_addr)); monc->hunting = false; } dout("handle_subscribe_ack after %d seconds\n", seconds); @@ -266,6 +272,7 @@ int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) mutex_unlock(&monc->mutex); return 0; } +EXPORT_SYMBOL(ceph_monc_got_mdsmap); int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) { @@ -310,6 +317,7 @@ int ceph_monc_open_session(struct ceph_mon_client *monc) mutex_unlock(&monc->mutex); return 0; } +EXPORT_SYMBOL(ceph_monc_open_session); /* * The monitor responds with mount ack indicate mount success. The @@ -540,6 +548,7 @@ out: kref_put(&req->kref, release_generic_request); return err; } +EXPORT_SYMBOL(ceph_monc_do_statfs); /* * pool ops @@ -651,6 +660,7 @@ int ceph_monc_create_snapid(struct ceph_mon_client *monc, pool, 0, (char *)snapid, sizeof(*snapid)); } +EXPORT_SYMBOL(ceph_monc_create_snapid); int ceph_monc_delete_snapid(struct ceph_mon_client *monc, u32 pool, u64 snapid) @@ -708,9 +718,9 @@ static void delayed_work(struct work_struct *work) */ static int build_initial_monmap(struct ceph_mon_client *monc) { - struct ceph_mount_args *args = monc->client->mount_args; - struct ceph_entity_addr *mon_addr = args->mon_addr; - int num_mon = args->num_mon; + struct ceph_options *opt = monc->client->options; + struct ceph_entity_addr *mon_addr = opt->mon_addr; + int num_mon = opt->num_mon; int i; /* build initial monmap */ @@ -728,11 +738,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc) } monc->monmap->num_mon = num_mon; monc->have_fsid = false; - - /* release addr memory */ - kfree(args->mon_addr); - args->mon_addr = NULL; - args->num_mon = 0; return 0; } @@ -753,8 +758,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->con = NULL; /* authentication */ - monc->auth = ceph_auth_init(cl->mount_args->name, - cl->mount_args->secret); + monc->auth = ceph_auth_init(cl->options->name, + cl->options->secret); if (IS_ERR(monc->auth)) return PTR_ERR(monc->auth); monc->auth->want_keys = @@ -808,6 +813,7 @@ out_monmap: out: return err; } +EXPORT_SYMBOL(ceph_monc_init); void ceph_monc_stop(struct ceph_mon_client *monc) { @@ -832,6 +838,7 @@ void ceph_monc_stop(struct ceph_mon_client *monc) kfree(monc->monmap); } +EXPORT_SYMBOL(ceph_monc_stop); static void handle_auth_reply(struct ceph_mon_client *monc, struct ceph_msg *msg) @@ -889,6 +896,7 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc) mutex_unlock(&monc->mutex); return ret; } +EXPORT_SYMBOL(ceph_monc_validate_auth); /* * handle incoming message @@ -922,15 +930,16 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) ceph_monc_handle_map(monc, msg); break; - case CEPH_MSG_MDS_MAP: - ceph_mdsc_handle_map(&monc->client->mdsc, msg); - break; - case CEPH_MSG_OSD_MAP: ceph_osdc_handle_map(&monc->client->osdc, msg); break; default: + /* can the chained handler handle it? */ + if (monc->client->extra_mon_dispatch && + monc->client->extra_mon_dispatch(monc->client, msg) == 0) + break; + pr_err("received unknown message type %d %s\n", type, ceph_msg_type_name(type)); } @@ -994,7 +1003,7 @@ static void mon_fault(struct ceph_connection *con) if (monc->con && !monc->hunting) pr_info("mon%d %s session lost, " "hunting for new mon\n", monc->cur_mon, - pr_addr(&monc->con->peer_addr.in_addr)); + ceph_pr_addr(&monc->con->peer_addr.in_addr)); __close_session(monc); if (!monc->hunting) { diff --git a/fs/ceph/msgpool.c b/net/ceph/msgpool.c index dd65a6438131..d5f2d97ac05c 100644 --- a/fs/ceph/msgpool.c +++ b/net/ceph/msgpool.c @@ -1,11 +1,11 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/types.h> #include <linux/vmalloc.h> -#include "msgpool.h" +#include <linux/ceph/msgpool.h> static void *alloc_fn(gfp_t gfp_mask, void *arg) { diff --git a/fs/ceph/osd_client.c b/net/ceph/osd_client.c index dfced1dacbcd..79391994b3ed 100644 --- a/fs/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1,17 +1,22 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> +#include <linux/module.h> #include <linux/err.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/uaccess.h> +#ifdef CONFIG_BLOCK +#include <linux/bio.h> +#endif -#include "super.h" -#include "osd_client.h" -#include "messenger.h" -#include "decode.h" -#include "auth.h" +#include <linux/ceph/libceph.h> +#include <linux/ceph/osd_client.h> +#include <linux/ceph/messenger.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/pagelist.h> #define OSD_OP_FRONT_LEN 4096 #define OSD_OPREPLY_FRONT_LEN 512 @@ -22,6 +27,59 @@ static int __kick_requests(struct ceph_osd_client *osdc, static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); +static int op_needs_trail(int op) +{ + switch (op) { + case CEPH_OSD_OP_GETXATTR: + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + case CEPH_OSD_OP_CALL: + return 1; + default: + return 0; + } +} + +static int op_has_extent(int op) +{ + return (op == CEPH_OSD_OP_READ || + op == CEPH_OSD_OP_WRITE); +} + +void ceph_calc_raw_layout(struct ceph_osd_client *osdc, + struct ceph_file_layout *layout, + u64 snapid, + u64 off, u64 *plen, u64 *bno, + struct ceph_osd_request *req, + struct ceph_osd_req_op *op) +{ + struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; + u64 orig_len = *plen; + u64 objoff, objlen; /* extent in object */ + + reqhead->snapid = cpu_to_le64(snapid); + + /* object extent? */ + ceph_calc_file_object_mapping(layout, off, plen, bno, + &objoff, &objlen); + if (*plen < orig_len) + dout(" skipping last %llu, final file extent %llu~%llu\n", + orig_len - *plen, off, *plen); + + if (op_has_extent(op->op)) { + op->extent.offset = objoff; + op->extent.length = objlen; + } + req->r_num_pages = calc_pages_for(off, *plen); + if (op->op == CEPH_OSD_OP_WRITE) + op->payload_len = *plen; + + dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", + *bno, objoff, objlen, req->r_num_pages); + +} +EXPORT_SYMBOL(ceph_calc_raw_layout); + /* * Implement client access to distributed object storage cluster. * @@ -48,34 +106,19 @@ static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); * fill osd op in request message. */ static void calc_layout(struct ceph_osd_client *osdc, - struct ceph_vino vino, struct ceph_file_layout *layout, + struct ceph_vino vino, + struct ceph_file_layout *layout, u64 off, u64 *plen, - struct ceph_osd_request *req) + struct ceph_osd_request *req, + struct ceph_osd_req_op *op) { - struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; - struct ceph_osd_op *op = (void *)(reqhead + 1); - u64 orig_len = *plen; - u64 objoff, objlen; /* extent in object */ u64 bno; - reqhead->snapid = cpu_to_le64(vino.snap); - - /* object extent? */ - ceph_calc_file_object_mapping(layout, off, plen, &bno, - &objoff, &objlen); - if (*plen < orig_len) - dout(" skipping last %llu, final file extent %llu~%llu\n", - orig_len - *plen, off, *plen); + ceph_calc_raw_layout(osdc, layout, vino.snap, off, + plen, &bno, req, op); sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno); req->r_oid_len = strlen(req->r_oid); - - op->extent.offset = cpu_to_le64(objoff); - op->extent.length = cpu_to_le64(objlen); - req->r_num_pages = calc_pages_for(off, *plen); - - dout("calc_layout %s (%d) %llu~%llu (%d pages)\n", - req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages); } /* @@ -101,56 +144,66 @@ void ceph_osdc_release_request(struct kref *kref) if (req->r_own_pages) ceph_release_page_vector(req->r_pages, req->r_num_pages); +#ifdef CONFIG_BLOCK + if (req->r_bio) + bio_put(req->r_bio); +#endif ceph_put_snap_context(req->r_snapc); + if (req->r_trail) { + ceph_pagelist_release(req->r_trail); + kfree(req->r_trail); + } if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); else kfree(req); } +EXPORT_SYMBOL(ceph_osdc_release_request); -/* - * build new request AND message, calculate layout, and adjust file - * extent as needed. - * - * if the file was recently truncated, we include information about its - * old and new size so that the object can be updated appropriately. (we - * avoid synchronously deleting truncated objects because it's slow.) - * - * if @do_sync, include a 'startsync' command so that the osd will flush - * data quickly. - */ -struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, - struct ceph_file_layout *layout, - struct ceph_vino vino, - u64 off, u64 *plen, - int opcode, int flags, +static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail) +{ + int i = 0; + + if (needs_trail) + *needs_trail = 0; + while (ops[i].op) { + if (needs_trail && op_needs_trail(ops[i].op)) + *needs_trail = 1; + i++; + } + + return i; +} + +struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, + int flags, struct ceph_snap_context *snapc, - int do_sync, - u32 truncate_seq, - u64 truncate_size, - struct timespec *mtime, - bool use_mempool, int num_reply) + struct ceph_osd_req_op *ops, + bool use_mempool, + gfp_t gfp_flags, + struct page **pages, + struct bio *bio) { struct ceph_osd_request *req; struct ceph_msg *msg; - struct ceph_osd_request_head *head; - struct ceph_osd_op *op; - void *p; - int num_op = 1 + do_sync; - size_t msg_size = sizeof(*head) + num_op*sizeof(*op); - int i; + int needs_trail; + int num_op = get_num_ops(ops, &needs_trail); + size_t msg_size = sizeof(struct ceph_osd_request_head); + + msg_size += num_op*sizeof(struct ceph_osd_op); if (use_mempool) { - req = mempool_alloc(osdc->req_mempool, GFP_NOFS); + req = mempool_alloc(osdc->req_mempool, gfp_flags); memset(req, 0, sizeof(*req)); } else { - req = kzalloc(sizeof(*req), GFP_NOFS); + req = kzalloc(sizeof(*req), gfp_flags); } if (req == NULL) return NULL; req->r_osdc = osdc; req->r_mempool = use_mempool; + kref_init(&req->r_kref); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); @@ -164,13 +217,22 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); else msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, - OSD_OPREPLY_FRONT_LEN, GFP_NOFS); + OSD_OPREPLY_FRONT_LEN, gfp_flags); if (!msg) { ceph_osdc_put_request(req); return NULL; } req->r_reply = msg; + /* allocate space for the trailing data */ + if (needs_trail) { + req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags); + if (!req->r_trail) { + ceph_osdc_put_request(req); + return NULL; + } + ceph_pagelist_init(req->r_trail); + } /* create request message; allow space for oid */ msg_size += 40; if (snapc) @@ -178,18 +240,115 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); else - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags); if (!msg) { ceph_osdc_put_request(req); return NULL; } + msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); memset(msg->front.iov_base, 0, msg->front.iov_len); + + req->r_request = msg; + req->r_pages = pages; +#ifdef CONFIG_BLOCK + if (bio) { + req->r_bio = bio; + bio_get(req->r_bio); + } +#endif + + return req; +} +EXPORT_SYMBOL(ceph_osdc_alloc_request); + +static void osd_req_encode_op(struct ceph_osd_request *req, + struct ceph_osd_op *dst, + struct ceph_osd_req_op *src) +{ + dst->op = cpu_to_le16(src->op); + + switch (dst->op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + dst->extent.offset = + cpu_to_le64(src->extent.offset); + dst->extent.length = + cpu_to_le64(src->extent.length); + dst->extent.truncate_size = + cpu_to_le64(src->extent.truncate_size); + dst->extent.truncate_seq = + cpu_to_le32(src->extent.truncate_seq); + break; + + case CEPH_OSD_OP_GETXATTR: + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + BUG_ON(!req->r_trail); + + dst->xattr.name_len = cpu_to_le32(src->xattr.name_len); + dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); + dst->xattr.cmp_op = src->xattr.cmp_op; + dst->xattr.cmp_mode = src->xattr.cmp_mode; + ceph_pagelist_append(req->r_trail, src->xattr.name, + src->xattr.name_len); + ceph_pagelist_append(req->r_trail, src->xattr.val, + src->xattr.value_len); + break; + case CEPH_OSD_OP_CALL: + BUG_ON(!req->r_trail); + + dst->cls.class_len = src->cls.class_len; + dst->cls.method_len = src->cls.method_len; + dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); + + ceph_pagelist_append(req->r_trail, src->cls.class_name, + src->cls.class_len); + ceph_pagelist_append(req->r_trail, src->cls.method_name, + src->cls.method_len); + ceph_pagelist_append(req->r_trail, src->cls.indata, + src->cls.indata_len); + break; + case CEPH_OSD_OP_ROLLBACK: + dst->snap.snapid = cpu_to_le64(src->snap.snapid); + break; + case CEPH_OSD_OP_STARTSYNC: + break; + default: + pr_err("unrecognized osd opcode %d\n", dst->op); + WARN_ON(1); + break; + } + dst->payload_len = cpu_to_le32(src->payload_len); +} + +/* + * build new request AND message + * + */ +void ceph_osdc_build_request(struct ceph_osd_request *req, + u64 off, u64 *plen, + struct ceph_osd_req_op *src_ops, + struct ceph_snap_context *snapc, + struct timespec *mtime, + const char *oid, + int oid_len) +{ + struct ceph_msg *msg = req->r_request; + struct ceph_osd_request_head *head; + struct ceph_osd_req_op *src_op; + struct ceph_osd_op *op; + void *p; + int num_op = get_num_ops(src_ops, NULL); + size_t msg_size = sizeof(*head) + num_op*sizeof(*op); + int flags = req->r_flags; + u64 data_len = 0; + int i; + head = msg->front.iov_base; op = (void *)(head + 1); p = (void *)(op + num_op); - req->r_request = msg; req->r_snapc = ceph_get_snap_context(snapc); head->client_inc = cpu_to_le32(1); /* always, for now. */ @@ -197,29 +356,23 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (flags & CEPH_OSD_FLAG_WRITE) ceph_encode_timespec(&head->mtime, mtime); head->num_ops = cpu_to_le16(num_op); - op->op = cpu_to_le16(opcode); - /* calculate max write size */ - calc_layout(osdc, vino, layout, off, plen, req); - req->r_file_layout = *layout; /* keep a copy */ - - if (flags & CEPH_OSD_FLAG_WRITE) { - req->r_request->hdr.data_off = cpu_to_le16(off); - req->r_request->hdr.data_len = cpu_to_le32(*plen); - op->payload_len = cpu_to_le32(*plen); - } - op->extent.truncate_size = cpu_to_le64(truncate_size); - op->extent.truncate_seq = cpu_to_le32(truncate_seq); /* fill in oid */ - head->object_len = cpu_to_le32(req->r_oid_len); - memcpy(p, req->r_oid, req->r_oid_len); - p += req->r_oid_len; - - if (do_sync) { + head->object_len = cpu_to_le32(oid_len); + memcpy(p, oid, oid_len); + p += oid_len; + + src_op = src_ops; + while (src_op->op) { + osd_req_encode_op(req, op, src_op); + src_op++; op++; - op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC); } + + if (req->r_trail) + data_len += req->r_trail->length; + if (snapc) { head->snap_seq = cpu_to_le64(snapc->seq); head->num_snaps = cpu_to_le32(snapc->num_snaps); @@ -229,12 +382,79 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } } + if (flags & CEPH_OSD_FLAG_WRITE) { + req->r_request->hdr.data_off = cpu_to_le16(off); + req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len); + } else if (data_len) { + req->r_request->hdr.data_off = 0; + req->r_request->hdr.data_len = cpu_to_le32(data_len); + } + BUG_ON(p > msg->front.iov_base + msg->front.iov_len); msg_size = p - msg->front.iov_base; msg->front.iov_len = msg_size; msg->hdr.front_len = cpu_to_le32(msg_size); + return; +} +EXPORT_SYMBOL(ceph_osdc_build_request); + +/* + * build new request AND message, calculate layout, and adjust file + * extent as needed. + * + * if the file was recently truncated, we include information about its + * old and new size so that the object can be updated appropriately. (we + * avoid synchronously deleting truncated objects because it's slow.) + * + * if @do_sync, include a 'startsync' command so that the osd will flush + * data quickly. + */ +struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, + struct ceph_file_layout *layout, + struct ceph_vino vino, + u64 off, u64 *plen, + int opcode, int flags, + struct ceph_snap_context *snapc, + int do_sync, + u32 truncate_seq, + u64 truncate_size, + struct timespec *mtime, + bool use_mempool, int num_reply) +{ + struct ceph_osd_req_op ops[3]; + struct ceph_osd_request *req; + + ops[0].op = opcode; + ops[0].extent.truncate_seq = truncate_seq; + ops[0].extent.truncate_size = truncate_size; + ops[0].payload_len = 0; + + if (do_sync) { + ops[1].op = CEPH_OSD_OP_STARTSYNC; + ops[1].payload_len = 0; + ops[2].op = 0; + } else + ops[1].op = 0; + + req = ceph_osdc_alloc_request(osdc, flags, + snapc, ops, + use_mempool, + GFP_NOFS, NULL, NULL); + if (IS_ERR(req)) + return req; + + /* calculate max write size */ + calc_layout(osdc, vino, layout, off, plen, req, ops); + req->r_file_layout = *layout; /* keep a copy */ + + ceph_osdc_build_request(req, off, plen, ops, + snapc, + mtime, + req->r_oid, req->r_oid_len); + return req; } +EXPORT_SYMBOL(ceph_osdc_new_request); /* * We keep osd requests in an rbtree, sorted by ->r_tid. @@ -389,7 +609,7 @@ static void __move_osd_to_lru(struct ceph_osd_client *osdc, dout("__move_osd_to_lru %p\n", osd); BUG_ON(!list_empty(&osd->o_osd_lru)); list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); - osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ; + osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ; } static void __remove_osd_from_lru(struct ceph_osd *osd) @@ -483,7 +703,7 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) static void __schedule_osd_timeout(struct ceph_osd_client *osdc) { schedule_delayed_work(&osdc->timeout_work, - osdc->client->mount_args->osd_keepalive_timeout * HZ); + osdc->client->options->osd_keepalive_timeout * HZ); } static void __cancel_osd_timeout(struct ceph_osd_client *osdc) @@ -549,7 +769,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, */ static void __cancel_request(struct ceph_osd_request *req) { - if (req->r_sent) { + if (req->r_sent && req->r_osd) { ceph_con_revoke(&req->r_osd->o_con, req->r_request); req->r_sent = 0; } @@ -684,9 +904,9 @@ static void handle_timeout(struct work_struct *work) container_of(work, struct ceph_osd_client, timeout_work.work); struct ceph_osd_request *req, *last_req = NULL; struct ceph_osd *osd; - unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ; + unsigned long timeout = osdc->client->options->osd_timeout * HZ; unsigned long keepalive = - osdc->client->mount_args->osd_keepalive_timeout * HZ; + osdc->client->options->osd_keepalive_timeout * HZ; unsigned long last_stamp = 0; struct rb_node *p; struct list_head slow_osds; @@ -773,7 +993,7 @@ static void handle_osds_timeout(struct work_struct *work) container_of(work, struct ceph_osd_client, osds_timeout_work.work); unsigned long delay = - osdc->client->mount_args->osd_idle_ttl * HZ >> 2; + osdc->client->options->osd_idle_ttl * HZ >> 2; dout("osds timeout\n"); down_read(&osdc->map_sem); @@ -1104,6 +1324,10 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, req->r_request->pages = req->r_pages; req->r_request->nr_pages = req->r_num_pages; +#ifdef CONFIG_BLOCK + req->r_request->bio = req->r_bio; +#endif + req->r_request->trail = req->r_trail; register_request(osdc, req); @@ -1131,6 +1355,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, up_read(&osdc->map_sem); return rc; } +EXPORT_SYMBOL(ceph_osdc_start_request); /* * wait for a request to complete @@ -1153,6 +1378,7 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc, dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result); return req->r_result; } +EXPORT_SYMBOL(ceph_osdc_wait_request); /* * sync - wait for all in-flight requests to flush. avoid starvation. @@ -1186,6 +1412,7 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc) mutex_unlock(&osdc->request_mutex); dout("sync done (thru tid %llu)\n", last_tid); } +EXPORT_SYMBOL(ceph_osdc_sync); /* * init, shutdown @@ -1211,7 +1438,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); schedule_delayed_work(&osdc->osds_timeout_work, - round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ)); + round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); err = -ENOMEM; osdc->req_mempool = mempool_create_kmalloc_pool(10, @@ -1237,6 +1464,7 @@ out_mempool: out: return err; } +EXPORT_SYMBOL(ceph_osdc_init); void ceph_osdc_stop(struct ceph_osd_client *osdc) { @@ -1251,6 +1479,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) ceph_msgpool_destroy(&osdc->msgpool_op); ceph_msgpool_destroy(&osdc->msgpool_op_reply); } +EXPORT_SYMBOL(ceph_osdc_stop); /* * Read some contiguous pages. If we cross a stripe boundary, shorten @@ -1288,6 +1517,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, dout("readpages result %d\n", rc); return rc; } +EXPORT_SYMBOL(ceph_osdc_readpages); /* * do a synchronous write on N pages @@ -1330,6 +1560,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, dout("writepages result %d\n", rc); return rc; } +EXPORT_SYMBOL(ceph_osdc_writepages); /* * handle incoming message @@ -1420,6 +1651,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, } m->pages = req->r_pages; m->nr_pages = req->r_num_pages; +#ifdef CONFIG_BLOCK + m->bio = req->r_bio; +#endif } *skip = 0; req->r_con_filling_msg = ceph_con_get(con); diff --git a/fs/ceph/osdmap.c b/net/ceph/osdmap.c index e31f118f1392..d73f3f6efa36 100644 --- a/fs/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1,14 +1,15 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> +#include <linux/module.h> #include <linux/slab.h> #include <asm/div64.h> -#include "super.h" -#include "osdmap.h" -#include "crush/hash.h" -#include "crush/mapper.h" -#include "decode.h" +#include <linux/ceph/libceph.h> +#include <linux/ceph/osdmap.h> +#include <linux/ceph/decode.h> +#include <linux/crush/hash.h> +#include <linux/crush/mapper.h> char *ceph_osdmap_state_str(char *str, int len, int state) { @@ -417,6 +418,20 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) return NULL; } +int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name) +{ + struct rb_node *rbp; + + for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) { + struct ceph_pg_pool_info *pi = + rb_entry(rbp, struct ceph_pg_pool_info, node); + if (pi->name && strcmp(pi->name, name) == 0) + return pi->id; + } + return -ENOENT; +} +EXPORT_SYMBOL(ceph_pg_poolid_by_name); + static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) { rb_erase(&pi->node, root); @@ -966,6 +981,7 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); } +EXPORT_SYMBOL(ceph_calc_file_object_mapping); /* * calculate an object layout (i.e. pgid) from an oid, @@ -1011,6 +1027,7 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, ol->ol_stripe_unit = fl->fl_object_stripe_unit; return 0; } +EXPORT_SYMBOL(ceph_calc_object_layout); /* * Calculate raw osd vector for the given pgid. Return pointer to osd @@ -1108,3 +1125,4 @@ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) return osds[i]; return -1; } +EXPORT_SYMBOL(ceph_calc_pg_primary); diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c new file mode 100644 index 000000000000..13cb409a7bba --- /dev/null +++ b/net/ceph/pagelist.c @@ -0,0 +1,154 @@ + +#include <linux/module.h> +#include <linux/gfp.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/ceph/pagelist.h> + +static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) +{ + if (pl->mapped_tail) { + struct page *page = list_entry(pl->head.prev, struct page, lru); + kunmap(page); + pl->mapped_tail = NULL; + } +} + +int ceph_pagelist_release(struct ceph_pagelist *pl) +{ + ceph_pagelist_unmap_tail(pl); + while (!list_empty(&pl->head)) { + struct page *page = list_first_entry(&pl->head, struct page, + lru); + list_del(&page->lru); + __free_page(page); + } + ceph_pagelist_free_reserve(pl); + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_release); + +static int ceph_pagelist_addpage(struct ceph_pagelist *pl) +{ + struct page *page; + + if (!pl->num_pages_free) { + page = __page_cache_alloc(GFP_NOFS); + } else { + page = list_first_entry(&pl->free_list, struct page, lru); + list_del(&page->lru); + --pl->num_pages_free; + } + if (!page) + return -ENOMEM; + pl->room += PAGE_SIZE; + ceph_pagelist_unmap_tail(pl); + list_add_tail(&page->lru, &pl->head); + pl->mapped_tail = kmap(page); + return 0; +} + +int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len) +{ + while (pl->room < len) { + size_t bit = pl->room; + int ret; + + memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), + buf, bit); + pl->length += bit; + pl->room -= bit; + buf += bit; + len -= bit; + ret = ceph_pagelist_addpage(pl); + if (ret) + return ret; + } + + memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len); + pl->length += len; + pl->room -= len; + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_append); + +/** + * Allocate enough pages for a pagelist to append the given amount + * of data without without allocating. + * Returns: 0 on success, -ENOMEM on error. + */ +int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space) +{ + if (space <= pl->room) + return 0; + space -= pl->room; + space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT; /* conv to num pages */ + + while (space > pl->num_pages_free) { + struct page *page = __page_cache_alloc(GFP_NOFS); + if (!page) + return -ENOMEM; + list_add_tail(&page->lru, &pl->free_list); + ++pl->num_pages_free; + } + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_reserve); + +/** + * Free any pages that have been preallocated. + */ +int ceph_pagelist_free_reserve(struct ceph_pagelist *pl) +{ + while (!list_empty(&pl->free_list)) { + struct page *page = list_first_entry(&pl->free_list, + struct page, lru); + list_del(&page->lru); + __free_page(page); + --pl->num_pages_free; + } + BUG_ON(pl->num_pages_free); + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_free_reserve); + +/** + * Create a truncation point. + */ +void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c) +{ + c->pl = pl; + c->page_lru = pl->head.prev; + c->room = pl->room; +} +EXPORT_SYMBOL(ceph_pagelist_set_cursor); + +/** + * Truncate a pagelist to the given point. Move extra pages to reserve. + * This won't sleep. + * Returns: 0 on success, + * -EINVAL if the pagelist doesn't match the trunc point pagelist + */ +int ceph_pagelist_truncate(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c) +{ + struct page *page; + + if (pl != c->pl) + return -EINVAL; + ceph_pagelist_unmap_tail(pl); + while (pl->head.prev != c->page_lru) { + page = list_entry(pl->head.prev, struct page, lru); + list_del(&page->lru); /* remove from pagelist */ + list_add_tail(&page->lru, &pl->free_list); /* add to reserve */ + ++pl->num_pages_free; + } + pl->room = c->room; + if (!list_empty(&pl->head)) { + page = list_entry(pl->head.prev, struct page, lru); + pl->mapped_tail = kmap(page); + } + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_truncate); diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c new file mode 100644 index 000000000000..54caf0687155 --- /dev/null +++ b/net/ceph/pagevec.c @@ -0,0 +1,223 @@ +#include <linux/ceph/ceph_debug.h> + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/namei.h> +#include <linux/writeback.h> + +#include <linux/ceph/libceph.h> + +/* + * build a vector of user pages + */ +struct page **ceph_get_direct_page_vector(const char __user *data, + int num_pages, + loff_t off, size_t len) +{ + struct page **pages; + int rc; + + pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); + if (!pages) + return ERR_PTR(-ENOMEM); + + down_read(¤t->mm->mmap_sem); + rc = get_user_pages(current, current->mm, (unsigned long)data, + num_pages, 0, 0, pages, NULL); + up_read(¤t->mm->mmap_sem); + if (rc < 0) + goto fail; + return pages; + +fail: + kfree(pages); + return ERR_PTR(rc); +} +EXPORT_SYMBOL(ceph_get_direct_page_vector); + +void ceph_put_page_vector(struct page **pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + kfree(pages); +} +EXPORT_SYMBOL(ceph_put_page_vector); + +void ceph_release_page_vector(struct page **pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + __free_pages(pages[i], 0); + kfree(pages); +} +EXPORT_SYMBOL(ceph_release_page_vector); + +/* + * allocate a vector new pages + */ +struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) +{ + struct page **pages; + int i; + + pages = kmalloc(sizeof(*pages) * num_pages, flags); + if (!pages) + return ERR_PTR(-ENOMEM); + for (i = 0; i < num_pages; i++) { + pages[i] = __page_cache_alloc(flags); + if (pages[i] == NULL) { + ceph_release_page_vector(pages, i); + return ERR_PTR(-ENOMEM); + } + } + return pages; +} +EXPORT_SYMBOL(ceph_alloc_page_vector); + +/* + * copy user data into a page vector + */ +int ceph_copy_user_to_page_vector(struct page **pages, + const char __user *data, + loff_t off, size_t len) +{ + int i = 0; + int po = off & ~PAGE_CACHE_MASK; + int left = len; + int l, bad; + + while (left > 0) { + l = min_t(int, PAGE_CACHE_SIZE-po, left); + bad = copy_from_user(page_address(pages[i]) + po, data, l); + if (bad == l) + return -EFAULT; + data += l - bad; + left -= l - bad; + po += l - bad; + if (po == PAGE_CACHE_SIZE) { + po = 0; + i++; + } + } + return len; +} +EXPORT_SYMBOL(ceph_copy_user_to_page_vector); + +int ceph_copy_to_page_vector(struct page **pages, + const char *data, + loff_t off, size_t len) +{ + int i = 0; + size_t po = off & ~PAGE_CACHE_MASK; + size_t left = len; + size_t l; + + while (left > 0) { + l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + memcpy(page_address(pages[i]) + po, data, l); + data += l; + left -= l; + po += l; + if (po == PAGE_CACHE_SIZE) { + po = 0; + i++; + } + } + return len; +} +EXPORT_SYMBOL(ceph_copy_to_page_vector); + +int ceph_copy_from_page_vector(struct page **pages, + char *data, + loff_t off, size_t len) +{ + int i = 0; + size_t po = off & ~PAGE_CACHE_MASK; + size_t left = len; + size_t l; + + while (left > 0) { + l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + memcpy(data, page_address(pages[i]) + po, l); + data += l; + left -= l; + po += l; + if (po == PAGE_CACHE_SIZE) { + po = 0; + i++; + } + } + return len; +} +EXPORT_SYMBOL(ceph_copy_from_page_vector); + +/* + * copy user data from a page vector into a user pointer + */ +int ceph_copy_page_vector_to_user(struct page **pages, + char __user *data, + loff_t off, size_t len) +{ + int i = 0; + int po = off & ~PAGE_CACHE_MASK; + int left = len; + int l, bad; + + while (left > 0) { + l = min_t(int, left, PAGE_CACHE_SIZE-po); + bad = copy_to_user(data, page_address(pages[i]) + po, l); + if (bad == l) + return -EFAULT; + data += l - bad; + left -= l - bad; + if (po) { + po += l - bad; + if (po == PAGE_CACHE_SIZE) + po = 0; + } + i++; + } + return len; +} +EXPORT_SYMBOL(ceph_copy_page_vector_to_user); + +/* + * Zero an extent within a page vector. Offset is relative to the + * start of the first page. + */ +void ceph_zero_page_vector_range(int off, int len, struct page **pages) +{ + int i = off >> PAGE_CACHE_SHIFT; + + off &= ~PAGE_CACHE_MASK; + + dout("zero_page_vector_page %u~%u\n", off, len); + + /* leading partial page? */ + if (off) { + int end = min((int)PAGE_CACHE_SIZE, off + len); + dout("zeroing %d %p head from %d\n", i, pages[i], + (int)off); + zero_user_segment(pages[i], off, end); + len -= (end - off); + i++; + } + while (len >= PAGE_CACHE_SIZE) { + dout("zeroing %d %p len=%d\n", i, pages[i], len); + zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); + len -= PAGE_CACHE_SIZE; + i++; + } + /* trailing partial page? */ + if (len) { + dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len); + zero_user_segment(pages[i], 0, len); + } +} +EXPORT_SYMBOL(ceph_zero_page_vector_range); + diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 7a85367b3c2f..8451ab481095 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -348,7 +348,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, if (info.cmd == ETHTOOL_GRXCLSRLALL) { if (info.rule_cnt > 0) { if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) - rule_buf = kmalloc(info.rule_cnt * sizeof(u32), + rule_buf = kzalloc(info.rule_cnt * sizeof(u32), GFP_USER); if (!rule_buf) return -ENOMEM; @@ -397,7 +397,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index)) return -ENOMEM; full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size; - indir = kmalloc(full_size, GFP_USER); + indir = kzalloc(full_size, GFP_USER); if (!indir) return -ENOMEM; @@ -538,7 +538,7 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr) gstrings.len = ret; - data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); + data = kzalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); if (!data) return -ENOMEM; @@ -775,7 +775,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) if (regs.len > reglen) regs.len = reglen; - regbuf = kmalloc(reglen, GFP_USER); + regbuf = kzalloc(reglen, GFP_USER); if (!regbuf) return -ENOMEM; diff --git a/net/core/sock.c b/net/core/sock.c index ef30e9d286e7..7d99e13148e6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1078,8 +1078,11 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) #ifdef CONFIG_CGROUPS void sock_update_classid(struct sock *sk) { - u32 classid = task_cls_classid(current); + u32 classid; + rcu_read_lock(); /* doing current task, which cannot vanish. */ + classid = task_cls_classid(current); + rcu_read_unlock(); if (classid && classid != sk->sk_classid) sk->sk_classid = classid; } diff --git a/net/core/stream.c b/net/core/stream.c index d959e0f41528..f5df85dcd20b 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -141,10 +141,10 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; - sk_wait_event(sk, ¤t_timeo, !sk->sk_err && - !(sk->sk_shutdown & SEND_SHUTDOWN) && - sk_stream_memory_free(sk) && - vm_wait); + sk_wait_event(sk, ¤t_timeo, sk->sk_err || + (sk->sk_shutdown & SEND_SHUTDOWN) || + (sk_stream_memory_free(sk) && + !vm_wait)); sk->sk_write_pending--; if (vm_wait) { diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 72380a30d1c8..7cd7760144f7 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -413,7 +413,7 @@ config INET_XFRM_MODE_BEET If unsure, say Y. config INET_LRO - bool "Large Receive Offload (ipv4/tcp)" + tristate "Large Receive Offload (ipv4/tcp)" default y ---help--- Support for Large Receive Offload (ipv4/tcp). diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 1fdcacd36ce7..2a4bb76f2132 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -834,7 +834,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, int mark = 0; - if (len == 8 || IGMP_V2_SEEN(in_dev)) { + if (len == 8) { if (ih->code == 0) { /* Alas, old v1 router presents here. */ @@ -856,6 +856,18 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, igmpv3_clear_delrec(in_dev); } else if (len < 12) { return; /* ignore bogus packet; freed by caller */ + } else if (IGMP_V1_SEEN(in_dev)) { + /* This is a v3 query with v1 queriers present */ + max_delay = IGMP_Query_Response_Interval; + group = 0; + } else if (IGMP_V2_SEEN(in_dev)) { + /* this is a v3 query with v2 queriers present; + * Interpretation of the max_delay code is problematic here. + * A real v2 host would use ih_code directly, while v3 has a + * different encoding. We use the v3 encoding as more likely + * to be intended in a v3 query. + */ + max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); } else { /* v3 */ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) return; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 244f7cb08d68..37f8adb68c79 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -11,6 +11,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/percpu.h> +#include <linux/security.h> #include <net/net_namespace.h> #include <linux/netfilter.h> @@ -87,6 +88,29 @@ static void ct_seq_stop(struct seq_file *s, void *v) rcu_read_unlock(); } +#ifdef CONFIG_NF_CONNTRACK_SECMARK +static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + int ret; + u32 len; + char *secctx; + + ret = security_secid_to_secctx(ct->secmark, &secctx, &len); + if (ret) + return ret; + + ret = seq_printf(s, "secctx=%s ", secctx); + + security_release_secctx(secctx, len); + return ret; +} +#else +static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + return 0; +} +#endif + static int ct_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_tuple_hash *hash = v; @@ -148,10 +172,8 @@ static int ct_seq_show(struct seq_file *s, void *v) goto release; #endif -#ifdef CONFIG_NF_CONNTRACK_SECMARK - if (seq_printf(s, "secmark=%u ", ct->secmark)) + if (ct_show_secctx(s, ct)) goto release; -#endif if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) goto release; diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 8c8632d9b93c..957c9241fb0c 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -38,7 +38,7 @@ static DEFINE_SPINLOCK(nf_nat_lock); static struct nf_conntrack_l3proto *l3proto __read_mostly; #define MAX_IP_NAT_PROTO 256 -static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] +static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO] __read_mostly; static inline const struct nf_nat_protocol * diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8323136bdc54..a275c6e1e25c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1556,14 +1556,13 @@ out: * i.e. Path MTU discovery */ -void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, - struct net_device *dev, u32 pmtu) +static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr, + struct net *net, u32 pmtu, int ifindex) { struct rt6_info *rt, *nrt; - struct net *net = dev_net(dev); int allfrag = 0; - rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0); + rt = rt6_lookup(net, daddr, saddr, ifindex, 0); if (rt == NULL) return; @@ -1631,6 +1630,27 @@ out: dst_release(&rt->dst); } +void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, + struct net_device *dev, u32 pmtu) +{ + struct net *net = dev_net(dev); + + /* + * RFC 1981 states that a node "MUST reduce the size of the packets it + * is sending along the path" that caused the Packet Too Big message. + * Since it's not possible in the general case to determine which + * interface was used to send the original packet, we update the MTU + * on the interface that will be used to send future packets. We also + * update the MTU on the interface that received the Packet Too Big in + * case the original packet was forced out that interface with + * SO_BINDTODEVICE or similar. This is the next best thing to the + * correct behaviour, which would be to update the MTU on all + * interfaces. + */ + rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0); + rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex); +} + /* * Misc support functions */ diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index c893f236acea..8f23401832b7 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -175,6 +175,8 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, set_bit(HT_AGG_STATE_STOPPING, &tid_tx->state); + del_timer_sync(&tid_tx->addba_resp_timer); + /* * After this packets are no longer handed right through * to the driver but are put onto tid_tx->pending instead, diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 10caec5ea8fa..34da67995d94 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -377,7 +377,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) { skb2->dev = prev_dev; - netif_receive_skb(skb2); + netif_rx(skb2); } } @@ -386,7 +386,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) } if (prev_dev) { skb->dev = prev_dev; - netif_receive_skb(skb); + netif_rx(skb); skb = NULL; } rcu_read_unlock(); diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 78b505d33bfb..fdaec7daff1d 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -27,7 +27,7 @@ static DEFINE_MUTEX(afinfo_mutex); -const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; +const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; EXPORT_SYMBOL(nf_afinfo); int nf_register_afinfo(const struct nf_afinfo *afinfo) diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index cdcc7649476b..5702de35e2bb 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -26,10 +26,10 @@ static DEFINE_MUTEX(nf_ct_ecache_mutex); -struct nf_ct_event_notifier *nf_conntrack_event_cb __read_mostly; +struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_event_cb); -struct nf_exp_event_notifier *nf_expect_event_cb __read_mostly; +struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly; EXPORT_SYMBOL_GPL(nf_expect_event_cb); /* deliver cached events and clear cache entry - must be called with locally diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 8d9e4c949b96..bd82450c193f 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -16,7 +16,7 @@ #include <linux/skbuff.h> #include <net/netfilter/nf_conntrack_extend.h> -static struct nf_ct_ext_type *nf_ct_ext_types[NF_CT_EXT_NUM]; +static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM]; static DEFINE_MUTEX(nf_ct_ext_type_mutex); void __nf_ct_ext_destroy(struct nf_conn *ct) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 5bae1cd15eea..146476c6441a 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -22,6 +22,7 @@ #include <linux/rculist_nulls.h> #include <linux/types.h> #include <linux/timer.h> +#include <linux/security.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/netlink.h> @@ -245,16 +246,31 @@ nla_put_failure: #ifdef CONFIG_NF_CONNTRACK_SECMARK static inline int -ctnetlink_dump_secmark(struct sk_buff *skb, const struct nf_conn *ct) +ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) { - NLA_PUT_BE32(skb, CTA_SECMARK, htonl(ct->secmark)); - return 0; + struct nlattr *nest_secctx; + int len, ret; + char *secctx; + + ret = security_secid_to_secctx(ct->secmark, &secctx, &len); + if (ret) + return ret; + + ret = -1; + nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED); + if (!nest_secctx) + goto nla_put_failure; + + NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx); + nla_nest_end(skb, nest_secctx); + ret = 0; nla_put_failure: - return -1; + security_release_secctx(secctx, len); + return ret; } #else -#define ctnetlink_dump_secmark(a, b) (0) +#define ctnetlink_dump_secctx(a, b) (0) #endif #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) @@ -391,7 +407,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, ctnetlink_dump_protoinfo(skb, ct) < 0 || ctnetlink_dump_helpinfo(skb, ct) < 0 || ctnetlink_dump_mark(skb, ct) < 0 || - ctnetlink_dump_secmark(skb, ct) < 0 || + ctnetlink_dump_secctx(skb, ct) < 0 || ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || ctnetlink_dump_master(skb, ct) < 0 || @@ -437,6 +453,17 @@ ctnetlink_counters_size(const struct nf_conn *ct) ; } +#ifdef CONFIG_NF_CONNTRACK_SECMARK +static int ctnetlink_nlmsg_secctx_size(const struct nf_conn *ct) +{ + int len; + + security_secid_to_secctx(ct->secmark, NULL, &len); + + return sizeof(char) * len; +} +#endif + static inline size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) { @@ -453,7 +480,8 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) + nla_total_size(0) /* CTA_HELP */ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ #ifdef CONFIG_NF_CONNTRACK_SECMARK - + nla_total_size(sizeof(u_int32_t)) /* CTA_SECMARK */ + + nla_total_size(0) /* CTA_SECCTX */ + + nla_total_size(ctnetlink_nlmsg_secctx_size(ct)) /* CTA_SECCTX_NAME */ #endif #ifdef CONFIG_NF_NAT_NEEDED + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ @@ -556,7 +584,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) #ifdef CONFIG_NF_CONNTRACK_SECMARK if ((events & (1 << IPCT_SECMARK) || ct->secmark) - && ctnetlink_dump_secmark(skb, ct) < 0) + && ctnetlink_dump_secctx(skb, ct) < 0) goto nla_put_failure; #endif diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 5886ba1d52a0..ed6d92958023 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -28,8 +28,8 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> -static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly; -struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly; +static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly; +struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_l3protos); static DEFINE_MUTEX(nf_ct_proto_mutex); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index eb973fcd67ab..0fb65705b44b 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -15,6 +15,7 @@ #include <linux/seq_file.h> #include <linux/percpu.h> #include <linux/netdevice.h> +#include <linux/security.h> #include <net/net_namespace.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> @@ -108,6 +109,29 @@ static void ct_seq_stop(struct seq_file *s, void *v) rcu_read_unlock(); } +#ifdef CONFIG_NF_CONNTRACK_SECMARK +static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + int ret; + u32 len; + char *secctx; + + ret = security_secid_to_secctx(ct->secmark, &secctx, &len); + if (ret) + return ret; + + ret = seq_printf(s, "secctx=%s ", secctx); + + security_release_secctx(secctx, len); + return ret; +} +#else +static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + return 0; +} +#endif + /* return 0 on success, 1 in case of error */ static int ct_seq_show(struct seq_file *s, void *v) { @@ -168,10 +192,8 @@ static int ct_seq_show(struct seq_file *s, void *v) goto release; #endif -#ifdef CONFIG_NF_CONNTRACK_SECMARK - if (seq_printf(s, "secmark=%u ", ct->secmark)) + if (ct_show_secctx(s, ct)) goto release; -#endif #ifdef CONFIG_NF_CONNTRACK_ZONES if (seq_printf(s, "zone=%u ", nf_ct_zone(ct))) diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 7df37fd786bc..b07393eab88e 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -16,7 +16,7 @@ #define NF_LOG_PREFIXLEN 128 #define NFLOGGER_NAME_LEN 64 -static const struct nf_logger *nf_loggers[NFPROTO_NUMPROTO] __read_mostly; +static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly; static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly; static DEFINE_MUTEX(nf_log_mutex); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 78b3cf9c519c..74aebed5bd28 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -18,7 +18,7 @@ * long term mutex. The handler must provide an an outfn() to accept packets * for queueing and must reinject all packets it receives, no matter what. */ -static const struct nf_queue_handler *queue_handler[NFPROTO_NUMPROTO] __read_mostly; +static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly; static DEFINE_MUTEX(queue_handler_mutex); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 0cb6053f02fd..782e51986a6f 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -9,7 +9,6 @@ #include <linux/module.h> #include <linux/gfp.h> #include <linux/skbuff.h> -#include <linux/selinux.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 23b2d6c486b5..9faf5e050b79 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -14,8 +14,8 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> +#include <linux/security.h> #include <linux/skbuff.h> -#include <linux/selinux.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SECMARK.h> @@ -39,9 +39,8 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par) switch (mode) { case SECMARK_MODE_SEL: - secmark = info->u.sel.selsid; + secmark = info->secid; break; - default: BUG(); } @@ -50,33 +49,33 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static int checkentry_selinux(struct xt_secmark_target_info *info) +static int checkentry_lsm(struct xt_secmark_target_info *info) { int err; - struct xt_secmark_target_selinux_info *sel = &info->u.sel; - sel->selctx[SECMARK_SELCTX_MAX - 1] = '\0'; + info->secctx[SECMARK_SECCTX_MAX - 1] = '\0'; + info->secid = 0; - err = selinux_string_to_sid(sel->selctx, &sel->selsid); + err = security_secctx_to_secid(info->secctx, strlen(info->secctx), + &info->secid); if (err) { if (err == -EINVAL) - pr_info("invalid SELinux context \'%s\'\n", - sel->selctx); + pr_info("invalid security context \'%s\'\n", info->secctx); return err; } - if (!sel->selsid) { - pr_info("unable to map SELinux context \'%s\'\n", sel->selctx); + if (!info->secid) { + pr_info("unable to map security context \'%s\'\n", info->secctx); return -ENOENT; } - err = selinux_secmark_relabel_packet_permission(sel->selsid); + err = security_secmark_relabel_packet(info->secid); if (err) { pr_info("unable to obtain relabeling permission\n"); return err; } - selinux_secmark_refcount_inc(); + security_secmark_refcount_inc(); return 0; } @@ -100,16 +99,16 @@ static int secmark_tg_check(const struct xt_tgchk_param *par) switch (info->mode) { case SECMARK_MODE_SEL: - err = checkentry_selinux(info); - if (err <= 0) - return err; break; - default: pr_info("invalid mode: %hu\n", info->mode); return -EINVAL; } + err = checkentry_lsm(info); + if (err) + return err; + if (!mode) mode = info->mode; return 0; @@ -119,7 +118,7 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par) { switch (mode) { case SECMARK_MODE_SEL: - selinux_secmark_refcount_dec(); + security_secmark_refcount_dec(); } } diff --git a/net/rds/page.c b/net/rds/page.c index 595a952d4b17..1dfbfea12e9b 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -57,30 +57,17 @@ int rds_page_copy_user(struct page *page, unsigned long offset, unsigned long ret; void *addr; - if (to_user) + addr = kmap(page); + if (to_user) { rds_stats_add(s_copy_to_user, bytes); - else + ret = copy_to_user(ptr, addr + offset, bytes); + } else { rds_stats_add(s_copy_from_user, bytes); - - addr = kmap_atomic(page, KM_USER0); - if (to_user) - ret = __copy_to_user_inatomic(ptr, addr + offset, bytes); - else - ret = __copy_from_user_inatomic(addr + offset, ptr, bytes); - kunmap_atomic(addr, KM_USER0); - - if (ret) { - addr = kmap(page); - if (to_user) - ret = copy_to_user(ptr, addr + offset, bytes); - else - ret = copy_from_user(addr + offset, ptr, bytes); - kunmap(page); - if (ret) - return -EFAULT; + ret = copy_from_user(addr + offset, ptr, bytes); } + kunmap(page); - return 0; + return ret ? -EFAULT : 0; } EXPORT_SYMBOL_GPL(rds_page_copy_user); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 7416a5c73b2a..b0c2a82178af 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -137,7 +137,7 @@ next_knode: int toff = off + key->off + (off2 & key->offmask); __be32 *data, _data; - if (skb_headroom(skb) + toff < 0) + if (skb_headroom(skb) + toff > INT_MAX) goto out; data = skb_header_pointer(skb, toff, 4, &_data); diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 86366390038a..ddbbf7c81fa1 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -543,16 +543,20 @@ struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) id = ntohs(hmacs->hmac_ids[i]); /* Check the id is in the supported range */ - if (id > SCTP_AUTH_HMAC_ID_MAX) + if (id > SCTP_AUTH_HMAC_ID_MAX) { + id = 0; continue; + } /* See is we support the id. Supported IDs have name and * length fields set, so that we can allocated and use * them. We can safely just check for name, for without the * name, we can't allocate the TFM. */ - if (!sctp_hmac_list[id].hmac_name) + if (!sctp_hmac_list[id].hmac_name) { + id = 0; continue; + } break; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ca44917872d2..fbb70770ad05 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -916,6 +916,11 @@ SCTP_STATIC int sctp_setsockopt_bindx(struct sock* sk, /* Walk through the addrs buffer and count the number of addresses. */ addr_buf = kaddrs; while (walk_size < addrs_size) { + if (walk_size + sizeof(sa_family_t) > addrs_size) { + kfree(kaddrs); + return -EINVAL; + } + sa_addr = (struct sockaddr *)addr_buf; af = sctp_get_af_specific(sa_addr->sa_family); @@ -1002,9 +1007,13 @@ static int __sctp_connect(struct sock* sk, /* Walk through the addrs buffer and count the number of addresses. */ addr_buf = kaddrs; while (walk_size < addrs_size) { + if (walk_size + sizeof(sa_family_t) > addrs_size) { + err = -EINVAL; + goto out_free; + } + sa_addr = (union sctp_addr *)addr_buf; af = sctp_get_af_specific(sa_addr->sa.sa_family); - port = ntohs(sa_addr->v4.sin_port); /* If the address family is not supported or if this address * causes the address buffer to overflow return EINVAL. @@ -1014,6 +1023,8 @@ static int __sctp_connect(struct sock* sk, goto out_free; } + port = ntohs(sa_addr->v4.sin_port); + /* Save current address so we can work with it */ memcpy(&to, sa_addr, af->sockaddr_len); diff --git a/scripts/kconfig/conf.c b/scripts/kconfig/conf.c index 5b7c86ea43a1..7ef429cd5cb3 100644 --- a/scripts/kconfig/conf.c +++ b/scripts/kconfig/conf.c @@ -427,7 +427,7 @@ static void check_conf(struct menu *menu) if (sym->name && !sym_is_choice_value(sym)) { printf("CONFIG_%s\n", sym->name); } - } else { + } else if (input_mode != oldnoconfig) { if (!conf_cnt++) printf(_("*\n* Restart config...\n*\n")); rootEntry = menu_get_parent_menu(menu); diff --git a/scripts/kconfig/expr.h b/scripts/kconfig/expr.h index 6ee2e4fb1481..170459c224a1 100644 --- a/scripts/kconfig/expr.h +++ b/scripts/kconfig/expr.h @@ -165,7 +165,6 @@ struct menu { struct symbol *sym; struct property *prompt; struct expr *dep; - struct expr *dir_dep; unsigned int flags; char *help; struct file *file; diff --git a/scripts/kconfig/menu.c b/scripts/kconfig/menu.c index 4fb590247f33..edda8b49619d 100644 --- a/scripts/kconfig/menu.c +++ b/scripts/kconfig/menu.c @@ -107,7 +107,6 @@ static struct expr *menu_check_dep(struct expr *e) void menu_add_dep(struct expr *dep) { current_entry->dep = expr_alloc_and(current_entry->dep, menu_check_dep(dep)); - current_entry->dir_dep = current_entry->dep; } void menu_set_type(int type) @@ -291,10 +290,6 @@ void menu_finalize(struct menu *parent) for (menu = parent->list; menu; menu = menu->next) menu_finalize(menu); } else if (sym) { - /* ignore inherited dependencies for dir_dep */ - sym->dir_dep.expr = expr_transform(expr_copy(parent->dir_dep)); - sym->dir_dep.expr = expr_eliminate_dups(sym->dir_dep.expr); - basedep = parent->prompt ? parent->prompt->visible.expr : NULL; basedep = expr_trans_compare(basedep, E_UNEQUAL, &symbol_no); basedep = expr_eliminate_dups(expr_transform(basedep)); @@ -325,6 +320,8 @@ void menu_finalize(struct menu *parent) parent->next = last_menu->next; last_menu->next = NULL; } + + sym->dir_dep.expr = parent->dep; } for (menu = parent->list; menu; menu = menu->next) { if (sym && sym_is_choice(sym) && diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c index 943712ca6c0a..1f8b305449db 100644 --- a/scripts/kconfig/symbol.c +++ b/scripts/kconfig/symbol.c @@ -350,6 +350,7 @@ void sym_calc_value(struct symbol *sym) } } calc_newval: +#if 0 if (sym->dir_dep.tri == no && sym->rev_dep.tri != no) { fprintf(stderr, "warning: ("); expr_fprint(sym->rev_dep.expr, stderr); @@ -358,6 +359,7 @@ void sym_calc_value(struct symbol *sym) expr_fprint(sym->dir_dep.expr, stderr); fprintf(stderr, ")\n"); } +#endif newval.tri = EXPR_OR(newval.tri, sym->rev_dep.tri); } if (newval.tri == mod && sym_get_type(sym) == S_BOOLEAN) diff --git a/security/apparmor/.gitignore b/security/apparmor/.gitignore index 0a0a99f3b083..4d995aeaebc0 100644 --- a/security/apparmor/.gitignore +++ b/security/apparmor/.gitignore @@ -3,3 +3,4 @@ # af_names.h capability_names.h +rlim_names.h diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 7320331b44ab..544ff5837cb6 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -29,7 +29,7 @@ * aa_simple_write_to_buffer - common routine for getting policy from user * @op: operation doing the user buffer copy * @userbuf: user buffer to copy data from (NOT NULL) - * @alloc_size: size of user buffer + * @alloc_size: size of user buffer (REQUIRES: @alloc_size >= @copy_size) * @copy_size: size of data to copy from user buffer * @pos: position write is at in the file (NOT NULL) * @@ -42,6 +42,8 @@ static char *aa_simple_write_to_buffer(int op, const char __user *userbuf, { char *data; + BUG_ON(copy_size > alloc_size); + if (*pos != 0) /* only writes from pos 0, that is complete writes */ return ERR_PTR(-ESPIPE); diff --git a/security/capability.c b/security/capability.c index 95a6599a37bb..30ae00fbecd5 100644 --- a/security/capability.c +++ b/security/capability.c @@ -677,7 +677,18 @@ static void cap_inet_conn_established(struct sock *sk, struct sk_buff *skb) { } +static int cap_secmark_relabel_packet(u32 secid) +{ + return 0; +} +static void cap_secmark_refcount_inc(void) +{ +} + +static void cap_secmark_refcount_dec(void) +{ +} static void cap_req_classify_flow(const struct request_sock *req, struct flowi *fl) @@ -777,7 +788,8 @@ static int cap_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) static int cap_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { - return -EOPNOTSUPP; + *secid = 0; + return 0; } static void cap_release_secctx(char *secdata, u32 seclen) @@ -1018,6 +1030,9 @@ void __init security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, inet_conn_request); set_to_cap_if_null(ops, inet_csk_clone); set_to_cap_if_null(ops, inet_conn_established); + set_to_cap_if_null(ops, secmark_relabel_packet); + set_to_cap_if_null(ops, secmark_refcount_inc); + set_to_cap_if_null(ops, secmark_refcount_dec); set_to_cap_if_null(ops, req_classify_flow); set_to_cap_if_null(ops, tun_dev_create); set_to_cap_if_null(ops, tun_dev_post_create); diff --git a/security/commoncap.c b/security/commoncap.c index 9d172e6e330c..5e632b4857e4 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -719,14 +719,11 @@ static int cap_safe_nice(struct task_struct *p) /** * cap_task_setscheduler - Detemine if scheduler policy change is permitted * @p: The task to affect - * @policy: The policy to effect - * @lp: The parameters to the scheduling policy * * Detemine if the requested scheduler policy change is permitted for the * specified task, returning 0 if permission is granted, -ve if denied. */ -int cap_task_setscheduler(struct task_struct *p, int policy, - struct sched_param *lp) +int cap_task_setscheduler(struct task_struct *p) { return cap_safe_nice(p); } diff --git a/security/security.c b/security/security.c index c53949f17d9e..b50f472061a4 100644 --- a/security/security.c +++ b/security/security.c @@ -89,20 +89,12 @@ __setup("security=", choose_lsm); * Return true if: * -The passed LSM is the one chosen by user at boot time, * -or the passed LSM is configured as the default and the user did not - * choose an alternate LSM at boot time, - * -or there is no default LSM set and the user didn't specify a - * specific LSM and we're the first to ask for registration permission, - * -or the passed LSM is currently loaded. + * choose an alternate LSM at boot time. * Otherwise, return false. */ int __init security_module_enable(struct security_operations *ops) { - if (!*chosen_lsm) - strncpy(chosen_lsm, ops->name, SECURITY_NAME_MAX); - else if (strncmp(ops->name, chosen_lsm, SECURITY_NAME_MAX)) - return 0; - - return 1; + return !strcmp(ops->name, chosen_lsm); } /** @@ -786,10 +778,9 @@ int security_task_setrlimit(struct task_struct *p, unsigned int resource, return security_ops->task_setrlimit(p, resource, new_rlim); } -int security_task_setscheduler(struct task_struct *p, - int policy, struct sched_param *lp) +int security_task_setscheduler(struct task_struct *p) { - return security_ops->task_setscheduler(p, policy, lp); + return security_ops->task_setscheduler(p); } int security_task_getscheduler(struct task_struct *p) @@ -1145,6 +1136,24 @@ void security_inet_conn_established(struct sock *sk, security_ops->inet_conn_established(sk, skb); } +int security_secmark_relabel_packet(u32 secid) +{ + return security_ops->secmark_relabel_packet(secid); +} +EXPORT_SYMBOL(security_secmark_relabel_packet); + +void security_secmark_refcount_inc(void) +{ + security_ops->secmark_refcount_inc(); +} +EXPORT_SYMBOL(security_secmark_refcount_inc); + +void security_secmark_refcount_dec(void) +{ + security_ops->secmark_refcount_dec(); +} +EXPORT_SYMBOL(security_secmark_refcount_dec); + int security_tun_dev_create(void) { return security_ops->tun_dev_create(); diff --git a/security/selinux/Makefile b/security/selinux/Makefile index 58d80f3bd6f6..ad5cd76ec231 100644 --- a/security/selinux/Makefile +++ b/security/selinux/Makefile @@ -2,25 +2,20 @@ # Makefile for building the SELinux module as part of the kernel tree. # -obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/ - -selinux-y := avc.o \ - hooks.o \ - selinuxfs.o \ - netlink.o \ - nlmsgtab.o \ - netif.o \ - netnode.o \ - netport.o \ - exports.o +obj-$(CONFIG_SECURITY_SELINUX) := selinux.o + +selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o \ + netnode.o netport.o exports.o \ + ss/ebitmap.o ss/hashtab.o ss/symtab.o ss/sidtab.o ss/avtab.o \ + ss/policydb.o ss/services.o ss/conditional.o ss/mls.o ss/status.o selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o selinux-$(CONFIG_NETLABEL) += netlabel.o -EXTRA_CFLAGS += -Isecurity/selinux -Isecurity/selinux/include +ccflags-y := -Isecurity/selinux -Isecurity/selinux/include -$(obj)/avc.o: $(obj)/flask.h +$(addprefix $(obj)/,$(selinux-y)): $(obj)/flask.h quiet_cmd_flask = GEN $(obj)/flask.h $(obj)/av_permissions.h cmd_flask = scripts/selinux/genheaders/genheaders $(obj)/flask.h $(obj)/av_permissions.h diff --git a/security/selinux/exports.c b/security/selinux/exports.c index c0a454aee1e0..90664385dead 100644 --- a/security/selinux/exports.c +++ b/security/selinux/exports.c @@ -11,58 +11,9 @@ * it under the terms of the GNU General Public License version 2, * as published by the Free Software Foundation. */ -#include <linux/types.h> -#include <linux/kernel.h> #include <linux/module.h> -#include <linux/selinux.h> -#include <linux/fs.h> -#include <linux/ipc.h> -#include <asm/atomic.h> #include "security.h" -#include "objsec.h" - -/* SECMARK reference count */ -extern atomic_t selinux_secmark_refcount; - -int selinux_string_to_sid(char *str, u32 *sid) -{ - if (selinux_enabled) - return security_context_to_sid(str, strlen(str), sid); - else { - *sid = 0; - return 0; - } -} -EXPORT_SYMBOL_GPL(selinux_string_to_sid); - -int selinux_secmark_relabel_packet_permission(u32 sid) -{ - if (selinux_enabled) { - const struct task_security_struct *__tsec; - u32 tsid; - - __tsec = current_security(); - tsid = __tsec->sid; - - return avc_has_perm(tsid, sid, SECCLASS_PACKET, - PACKET__RELABELTO, NULL); - } - return 0; -} -EXPORT_SYMBOL_GPL(selinux_secmark_relabel_packet_permission); - -void selinux_secmark_refcount_inc(void) -{ - atomic_inc(&selinux_secmark_refcount); -} -EXPORT_SYMBOL_GPL(selinux_secmark_refcount_inc); - -void selinux_secmark_refcount_dec(void) -{ - atomic_dec(&selinux_secmark_refcount); -} -EXPORT_SYMBOL_GPL(selinux_secmark_refcount_dec); bool selinux_is_enabled(void) { diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 4796ddd4e721..d9154cf90ae1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3354,11 +3354,11 @@ static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource, return 0; } -static int selinux_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp) +static int selinux_task_setscheduler(struct task_struct *p) { int rc; - rc = cap_task_setscheduler(p, policy, lp); + rc = cap_task_setscheduler(p); if (rc) return rc; @@ -4279,6 +4279,27 @@ static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb) selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid); } +static int selinux_secmark_relabel_packet(u32 sid) +{ + const struct task_security_struct *__tsec; + u32 tsid; + + __tsec = current_security(); + tsid = __tsec->sid; + + return avc_has_perm(tsid, sid, SECCLASS_PACKET, PACKET__RELABELTO, NULL); +} + +static void selinux_secmark_refcount_inc(void) +{ + atomic_inc(&selinux_secmark_refcount); +} + +static void selinux_secmark_refcount_dec(void) +{ + atomic_dec(&selinux_secmark_refcount); +} + static void selinux_req_classify_flow(const struct request_sock *req, struct flowi *fl) { @@ -5533,6 +5554,9 @@ static struct security_operations selinux_ops = { .inet_conn_request = selinux_inet_conn_request, .inet_csk_clone = selinux_inet_csk_clone, .inet_conn_established = selinux_inet_conn_established, + .secmark_relabel_packet = selinux_secmark_relabel_packet, + .secmark_refcount_inc = selinux_secmark_refcount_inc, + .secmark_refcount_dec = selinux_secmark_refcount_dec, .req_classify_flow = selinux_req_classify_flow, .tun_dev_create = selinux_tun_dev_create, .tun_dev_post_create = selinux_tun_dev_post_create, diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index b4c9eb4bd6f9..8858d2b2d4b6 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -17,7 +17,7 @@ struct security_class_mapping secclass_map[] = { { "compute_av", "compute_create", "compute_member", "check_context", "load_policy", "compute_relabel", "compute_user", "setenforce", "setbool", "setsecparam", - "setcheckreqprot", NULL } }, + "setcheckreqprot", "read_policy", NULL } }, { "process", { "fork", "transition", "sigchld", "sigkill", "sigstop", "signull", "signal", "ptrace", "getsched", "setsched", diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 1f7c2491d3dc..671273eb1115 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -9,6 +9,7 @@ #define _SELINUX_SECURITY_H_ #include <linux/magic.h> +#include <linux/types.h> #include "flask.h" #define SECSID_NULL 0x00000000 /* unspecified SID */ @@ -82,6 +83,8 @@ extern int selinux_policycap_openperm; int security_mls_enabled(void); int security_load_policy(void *data, size_t len); +int security_read_policy(void **data, ssize_t *len); +size_t security_policydb_len(void); int security_policycap_supported(unsigned int req_cap); @@ -191,5 +194,25 @@ static inline int security_netlbl_sid_to_secattr(u32 sid, const char *security_get_initial_sid_context(u32 sid); +/* + * status notifier using mmap interface + */ +extern struct page *selinux_kernel_status_page(void); + +#define SELINUX_KERNEL_STATUS_VERSION 1 +struct selinux_kernel_status { + u32 version; /* version number of thie structure */ + u32 sequence; /* sequence number of seqlock logic */ + u32 enforcing; /* current setting of enforcing mode */ + u32 policyload; /* times of policy reloaded */ + u32 deny_unknown; /* current setting of deny_unknown */ + /* + * The version > 0 supports above members. + */ +} __attribute__((packed)); + +extern void selinux_status_update_setenforce(int enforcing); +extern void selinux_status_update_policyload(int seqno); + #endif /* _SELINUX_SECURITY_H_ */ diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 79a1bb635662..87e0556bae70 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -68,6 +68,8 @@ static int *bool_pending_values; static struct dentry *class_dir; static unsigned long last_class_ino; +static char policy_opened; + /* global data for policy capabilities */ static struct dentry *policycap_dir; @@ -110,6 +112,8 @@ enum sel_inos { SEL_COMPAT_NET, /* whether to use old compat network packet controls */ SEL_REJECT_UNKNOWN, /* export unknown reject handling to userspace */ SEL_DENY_UNKNOWN, /* export unknown deny handling to userspace */ + SEL_STATUS, /* export current status using mmap() */ + SEL_POLICY, /* allow userspace to read the in kernel policy */ SEL_INO_NEXT, /* The next inode number to use */ }; @@ -171,6 +175,7 @@ static ssize_t sel_write_enforce(struct file *file, const char __user *buf, if (selinux_enforcing) avc_ss_reset(0); selnl_notify_setenforce(selinux_enforcing); + selinux_status_update_setenforce(selinux_enforcing); } length = count; out: @@ -205,6 +210,59 @@ static const struct file_operations sel_handle_unknown_ops = { .llseek = generic_file_llseek, }; +static int sel_open_handle_status(struct inode *inode, struct file *filp) +{ + struct page *status = selinux_kernel_status_page(); + + if (!status) + return -ENOMEM; + + filp->private_data = status; + + return 0; +} + +static ssize_t sel_read_handle_status(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct page *status = filp->private_data; + + BUG_ON(!status); + + return simple_read_from_buffer(buf, count, ppos, + page_address(status), + sizeof(struct selinux_kernel_status)); +} + +static int sel_mmap_handle_status(struct file *filp, + struct vm_area_struct *vma) +{ + struct page *status = filp->private_data; + unsigned long size = vma->vm_end - vma->vm_start; + + BUG_ON(!status); + + /* only allows one page from the head */ + if (vma->vm_pgoff > 0 || size != PAGE_SIZE) + return -EIO; + /* disallow writable mapping */ + if (vma->vm_flags & VM_WRITE) + return -EPERM; + /* disallow mprotect() turns it into writable */ + vma->vm_flags &= ~VM_MAYWRITE; + + return remap_pfn_range(vma, vma->vm_start, + page_to_pfn(status), + size, vma->vm_page_prot); +} + +static const struct file_operations sel_handle_status_ops = { + .open = sel_open_handle_status, + .read = sel_read_handle_status, + .mmap = sel_mmap_handle_status, + .llseek = generic_file_llseek, +}; + #ifdef CONFIG_SECURITY_SELINUX_DISABLE static ssize_t sel_write_disable(struct file *file, const char __user *buf, size_t count, loff_t *ppos) @@ -296,6 +354,141 @@ static const struct file_operations sel_mls_ops = { .llseek = generic_file_llseek, }; +struct policy_load_memory { + size_t len; + void *data; +}; + +static int sel_open_policy(struct inode *inode, struct file *filp) +{ + struct policy_load_memory *plm = NULL; + int rc; + + BUG_ON(filp->private_data); + + mutex_lock(&sel_mutex); + + rc = task_has_security(current, SECURITY__READ_POLICY); + if (rc) + goto err; + + rc = -EBUSY; + if (policy_opened) + goto err; + + rc = -ENOMEM; + plm = kzalloc(sizeof(*plm), GFP_KERNEL); + if (!plm) + goto err; + + if (i_size_read(inode) != security_policydb_len()) { + mutex_lock(&inode->i_mutex); + i_size_write(inode, security_policydb_len()); + mutex_unlock(&inode->i_mutex); + } + + rc = security_read_policy(&plm->data, &plm->len); + if (rc) + goto err; + + policy_opened = 1; + + filp->private_data = plm; + + mutex_unlock(&sel_mutex); + + return 0; +err: + mutex_unlock(&sel_mutex); + + if (plm) + vfree(plm->data); + kfree(plm); + return rc; +} + +static int sel_release_policy(struct inode *inode, struct file *filp) +{ + struct policy_load_memory *plm = filp->private_data; + + BUG_ON(!plm); + + policy_opened = 0; + + vfree(plm->data); + kfree(plm); + + return 0; +} + +static ssize_t sel_read_policy(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct policy_load_memory *plm = filp->private_data; + int ret; + + mutex_lock(&sel_mutex); + + ret = task_has_security(current, SECURITY__READ_POLICY); + if (ret) + goto out; + + ret = simple_read_from_buffer(buf, count, ppos, plm->data, plm->len); +out: + mutex_unlock(&sel_mutex); + return ret; +} + +static int sel_mmap_policy_fault(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct policy_load_memory *plm = vma->vm_file->private_data; + unsigned long offset; + struct page *page; + + if (vmf->flags & (FAULT_FLAG_MKWRITE | FAULT_FLAG_WRITE)) + return VM_FAULT_SIGBUS; + + offset = vmf->pgoff << PAGE_SHIFT; + if (offset >= roundup(plm->len, PAGE_SIZE)) + return VM_FAULT_SIGBUS; + + page = vmalloc_to_page(plm->data + offset); + get_page(page); + + vmf->page = page; + + return 0; +} + +static struct vm_operations_struct sel_mmap_policy_ops = { + .fault = sel_mmap_policy_fault, + .page_mkwrite = sel_mmap_policy_fault, +}; + +int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHARED) { + /* do not allow mprotect to make mapping writable */ + vma->vm_flags &= ~VM_MAYWRITE; + + if (vma->vm_flags & VM_WRITE) + return -EACCES; + } + + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &sel_mmap_policy_ops; + + return 0; +} + +static const struct file_operations sel_policy_ops = { + .open = sel_open_policy, + .read = sel_read_policy, + .mmap = sel_mmap_policy, + .release = sel_release_policy, +}; + static ssize_t sel_write_load(struct file *file, const char __user *buf, size_t count, loff_t *ppos) @@ -1612,6 +1805,8 @@ static int sel_fill_super(struct super_block *sb, void *data, int silent) [SEL_CHECKREQPROT] = {"checkreqprot", &sel_checkreqprot_ops, S_IRUGO|S_IWUSR}, [SEL_REJECT_UNKNOWN] = {"reject_unknown", &sel_handle_unknown_ops, S_IRUGO}, [SEL_DENY_UNKNOWN] = {"deny_unknown", &sel_handle_unknown_ops, S_IRUGO}, + [SEL_STATUS] = {"status", &sel_handle_status_ops, S_IRUGO}, + [SEL_POLICY] = {"policy", &sel_policy_ops, S_IRUSR}, /* last one */ {""} }; ret = simple_fill_super(sb, SELINUX_MAGIC, selinux_files); diff --git a/security/selinux/ss/Makefile b/security/selinux/ss/Makefile deleted file mode 100644 index 15d4e62917de..000000000000 --- a/security/selinux/ss/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for building the SELinux security server as part of the kernel tree. -# - -EXTRA_CFLAGS += -Isecurity/selinux -Isecurity/selinux/include -obj-y := ss.o - -ss-y := ebitmap.o hashtab.o symtab.o sidtab.o avtab.o policydb.o services.o conditional.o mls.o - diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c index 929480c6c430..a3dd9faa19c0 100644 --- a/security/selinux/ss/avtab.c +++ b/security/selinux/ss/avtab.c @@ -266,8 +266,8 @@ int avtab_alloc(struct avtab *h, u32 nrules) if (shift > 2) shift = shift - 2; nslot = 1 << shift; - if (nslot > MAX_AVTAB_SIZE) - nslot = MAX_AVTAB_SIZE; + if (nslot > MAX_AVTAB_HASH_BUCKETS) + nslot = MAX_AVTAB_HASH_BUCKETS; mask = nslot - 1; h->htable = kcalloc(nslot, sizeof(*(h->htable)), GFP_KERNEL); @@ -501,6 +501,48 @@ bad: goto out; } +int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp) +{ + __le16 buf16[4]; + __le32 buf32[1]; + int rc; + + buf16[0] = cpu_to_le16(cur->key.source_type); + buf16[1] = cpu_to_le16(cur->key.target_type); + buf16[2] = cpu_to_le16(cur->key.target_class); + buf16[3] = cpu_to_le16(cur->key.specified); + rc = put_entry(buf16, sizeof(u16), 4, fp); + if (rc) + return rc; + buf32[0] = cpu_to_le32(cur->datum.data); + rc = put_entry(buf32, sizeof(u32), 1, fp); + if (rc) + return rc; + return 0; +} + +int avtab_write(struct policydb *p, struct avtab *a, void *fp) +{ + unsigned int i; + int rc = 0; + struct avtab_node *cur; + __le32 buf[1]; + + buf[0] = cpu_to_le32(a->nel); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + for (i = 0; i < a->nslot; i++) { + for (cur = a->htable[i]; cur; cur = cur->next) { + rc = avtab_write_item(p, cur, fp); + if (rc) + return rc; + } + } + + return rc; +} void avtab_cache_init(void) { avtab_node_cachep = kmem_cache_create("avtab_node", diff --git a/security/selinux/ss/avtab.h b/security/selinux/ss/avtab.h index cd4f734e2749..dff0c75345c1 100644 --- a/security/selinux/ss/avtab.h +++ b/security/selinux/ss/avtab.h @@ -71,6 +71,8 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol, void *p); int avtab_read(struct avtab *a, void *fp, struct policydb *pol); +int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp); +int avtab_write(struct policydb *p, struct avtab *a, void *fp); struct avtab_node *avtab_insert_nonunique(struct avtab *h, struct avtab_key *key, struct avtab_datum *datum); @@ -85,7 +87,6 @@ void avtab_cache_destroy(void); #define MAX_AVTAB_HASH_BITS 11 #define MAX_AVTAB_HASH_BUCKETS (1 << MAX_AVTAB_HASH_BITS) #define MAX_AVTAB_HASH_MASK (MAX_AVTAB_HASH_BUCKETS-1) -#define MAX_AVTAB_SIZE MAX_AVTAB_HASH_BUCKETS #endif /* _SS_AVTAB_H_ */ diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c index c91e150c3087..655fe1c6cc69 100644 --- a/security/selinux/ss/conditional.c +++ b/security/selinux/ss/conditional.c @@ -490,6 +490,129 @@ err: return rc; } +int cond_write_bool(void *vkey, void *datum, void *ptr) +{ + char *key = vkey; + struct cond_bool_datum *booldatum = datum; + struct policy_data *pd = ptr; + void *fp = pd->fp; + __le32 buf[3]; + u32 len; + int rc; + + len = strlen(key); + buf[0] = cpu_to_le32(booldatum->value); + buf[1] = cpu_to_le32(booldatum->state); + buf[2] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 3, fp); + if (rc) + return rc; + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + return 0; +} + +/* + * cond_write_cond_av_list doesn't write out the av_list nodes. + * Instead it writes out the key/value pairs from the avtab. This + * is necessary because there is no way to uniquely identifying rules + * in the avtab so it is not possible to associate individual rules + * in the avtab with a conditional without saving them as part of + * the conditional. This means that the avtab with the conditional + * rules will not be saved but will be rebuilt on policy load. + */ +static int cond_write_av_list(struct policydb *p, + struct cond_av_list *list, struct policy_file *fp) +{ + __le32 buf[1]; + struct cond_av_list *cur_list; + u32 len; + int rc; + + len = 0; + for (cur_list = list; cur_list != NULL; cur_list = cur_list->next) + len++; + + buf[0] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + if (len == 0) + return 0; + + for (cur_list = list; cur_list != NULL; cur_list = cur_list->next) { + rc = avtab_write_item(p, cur_list->node, fp); + if (rc) + return rc; + } + + return 0; +} + +int cond_write_node(struct policydb *p, struct cond_node *node, + struct policy_file *fp) +{ + struct cond_expr *cur_expr; + __le32 buf[2]; + int rc; + u32 len = 0; + + buf[0] = cpu_to_le32(node->cur_state); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + for (cur_expr = node->expr; cur_expr != NULL; cur_expr = cur_expr->next) + len++; + + buf[0] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + for (cur_expr = node->expr; cur_expr != NULL; cur_expr = cur_expr->next) { + buf[0] = cpu_to_le32(cur_expr->expr_type); + buf[1] = cpu_to_le32(cur_expr->bool); + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + } + + rc = cond_write_av_list(p, node->true_list, fp); + if (rc) + return rc; + rc = cond_write_av_list(p, node->false_list, fp); + if (rc) + return rc; + + return 0; +} + +int cond_write_list(struct policydb *p, struct cond_node *list, void *fp) +{ + struct cond_node *cur; + u32 len; + __le32 buf[1]; + int rc; + + len = 0; + for (cur = list; cur != NULL; cur = cur->next) + len++; + buf[0] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + for (cur = list; cur != NULL; cur = cur->next) { + rc = cond_write_node(p, cur, fp); + if (rc) + return rc; + } + + return 0; +} /* Determine whether additional permissions are granted by the conditional * av table, and if so, add them to the result */ diff --git a/security/selinux/ss/conditional.h b/security/selinux/ss/conditional.h index 53ddb013ae57..3f209c635295 100644 --- a/security/selinux/ss/conditional.h +++ b/security/selinux/ss/conditional.h @@ -69,6 +69,8 @@ int cond_index_bool(void *key, void *datum, void *datap); int cond_read_bool(struct policydb *p, struct hashtab *h, void *fp); int cond_read_list(struct policydb *p, void *fp); +int cond_write_bool(void *key, void *datum, void *ptr); +int cond_write_list(struct policydb *p, struct cond_node *list, void *fp); void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd); diff --git a/security/selinux/ss/ebitmap.c b/security/selinux/ss/ebitmap.c index 04b6145d767f..d42951fcbe87 100644 --- a/security/selinux/ss/ebitmap.c +++ b/security/selinux/ss/ebitmap.c @@ -22,6 +22,8 @@ #include "ebitmap.h" #include "policydb.h" +#define BITS_PER_U64 (sizeof(u64) * 8) + int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2) { struct ebitmap_node *n1, *n2; @@ -363,10 +365,10 @@ int ebitmap_read(struct ebitmap *e, void *fp) e->highbit = le32_to_cpu(buf[1]); count = le32_to_cpu(buf[2]); - if (mapunit != sizeof(u64) * 8) { + if (mapunit != BITS_PER_U64) { printk(KERN_ERR "SELinux: ebitmap: map size %u does not " "match my size %Zd (high bit was %d)\n", - mapunit, sizeof(u64) * 8, e->highbit); + mapunit, BITS_PER_U64, e->highbit); goto bad; } @@ -446,3 +448,78 @@ bad: ebitmap_destroy(e); goto out; } + +int ebitmap_write(struct ebitmap *e, void *fp) +{ + struct ebitmap_node *n; + u32 count; + __le32 buf[3]; + u64 map; + int bit, last_bit, last_startbit, rc; + + buf[0] = cpu_to_le32(BITS_PER_U64); + + count = 0; + last_bit = 0; + last_startbit = -1; + ebitmap_for_each_positive_bit(e, n, bit) { + if (rounddown(bit, (int)BITS_PER_U64) > last_startbit) { + count++; + last_startbit = rounddown(bit, BITS_PER_U64); + } + last_bit = roundup(bit + 1, BITS_PER_U64); + } + buf[1] = cpu_to_le32(last_bit); + buf[2] = cpu_to_le32(count); + + rc = put_entry(buf, sizeof(u32), 3, fp); + if (rc) + return rc; + + map = 0; + last_startbit = INT_MIN; + ebitmap_for_each_positive_bit(e, n, bit) { + if (rounddown(bit, (int)BITS_PER_U64) > last_startbit) { + __le64 buf64[1]; + + /* this is the very first bit */ + if (!map) { + last_startbit = rounddown(bit, BITS_PER_U64); + map = (u64)1 << (bit - last_startbit); + continue; + } + + /* write the last node */ + buf[0] = cpu_to_le32(last_startbit); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + buf64[0] = cpu_to_le64(map); + rc = put_entry(buf64, sizeof(u64), 1, fp); + if (rc) + return rc; + + /* set up for the next node */ + map = 0; + last_startbit = rounddown(bit, BITS_PER_U64); + } + map |= (u64)1 << (bit - last_startbit); + } + /* write the last node */ + if (map) { + __le64 buf64[1]; + + /* write the last node */ + buf[0] = cpu_to_le32(last_startbit); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + buf64[0] = cpu_to_le64(map); + rc = put_entry(buf64, sizeof(u64), 1, fp); + if (rc) + return rc; + } + return 0; +} diff --git a/security/selinux/ss/ebitmap.h b/security/selinux/ss/ebitmap.h index f283b4367f54..1f4e93c2ae86 100644 --- a/security/selinux/ss/ebitmap.h +++ b/security/selinux/ss/ebitmap.h @@ -123,6 +123,7 @@ int ebitmap_get_bit(struct ebitmap *e, unsigned long bit); int ebitmap_set_bit(struct ebitmap *e, unsigned long bit, int value); void ebitmap_destroy(struct ebitmap *e); int ebitmap_read(struct ebitmap *e, void *fp); +int ebitmap_write(struct ebitmap *e, void *fp); #ifdef CONFIG_NETLABEL int ebitmap_netlbl_export(struct ebitmap *ebmap, diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index 3a29704be8ce..94f630d93a5c 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -37,6 +37,7 @@ #include "policydb.h" #include "conditional.h" #include "mls.h" +#include "services.h" #define _DEBUG_HASHES @@ -185,9 +186,19 @@ static u32 rangetr_hash(struct hashtab *h, const void *k) static int rangetr_cmp(struct hashtab *h, const void *k1, const void *k2) { const struct range_trans *key1 = k1, *key2 = k2; - return (key1->source_type != key2->source_type || - key1->target_type != key2->target_type || - key1->target_class != key2->target_class); + int v; + + v = key1->source_type - key2->source_type; + if (v) + return v; + + v = key1->target_type - key2->target_type; + if (v) + return v; + + v = key1->target_class - key2->target_class; + + return v; } /* @@ -1624,11 +1635,11 @@ static int role_bounds_sanity_check(void *key, void *datum, void *datap) static int type_bounds_sanity_check(void *key, void *datum, void *datap) { - struct type_datum *upper, *type; + struct type_datum *upper; struct policydb *p = datap; int depth = 0; - upper = type = datum; + upper = datum; while (upper->bounds) { if (++depth == POLICYDB_BOUNDS_MAXDEPTH) { printk(KERN_ERR "SELinux: type %s: " @@ -2306,3 +2317,843 @@ bad: policydb_destroy(p); goto out; } + +/* + * Write a MLS level structure to a policydb binary + * representation file. + */ +static int mls_write_level(struct mls_level *l, void *fp) +{ + __le32 buf[1]; + int rc; + + buf[0] = cpu_to_le32(l->sens); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + rc = ebitmap_write(&l->cat, fp); + if (rc) + return rc; + + return 0; +} + +/* + * Write a MLS range structure to a policydb binary + * representation file. + */ +static int mls_write_range_helper(struct mls_range *r, void *fp) +{ + __le32 buf[3]; + size_t items; + int rc, eq; + + eq = mls_level_eq(&r->level[1], &r->level[0]); + + if (eq) + items = 2; + else + items = 3; + buf[0] = cpu_to_le32(items-1); + buf[1] = cpu_to_le32(r->level[0].sens); + if (!eq) + buf[2] = cpu_to_le32(r->level[1].sens); + + BUG_ON(items > (sizeof(buf)/sizeof(buf[0]))); + + rc = put_entry(buf, sizeof(u32), items, fp); + if (rc) + return rc; + + rc = ebitmap_write(&r->level[0].cat, fp); + if (rc) + return rc; + if (!eq) { + rc = ebitmap_write(&r->level[1].cat, fp); + if (rc) + return rc; + } + + return 0; +} + +static int sens_write(void *vkey, void *datum, void *ptr) +{ + char *key = vkey; + struct level_datum *levdatum = datum; + struct policy_data *pd = ptr; + void *fp = pd->fp; + __le32 buf[2]; + size_t len; + int rc; + + len = strlen(key); + buf[0] = cpu_to_le32(len); + buf[1] = cpu_to_le32(levdatum->isalias); + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + + rc = mls_write_level(levdatum->level, fp); + if (rc) + return rc; + + return 0; +} + +static int cat_write(void *vkey, void *datum, void *ptr) +{ + char *key = vkey; + struct cat_datum *catdatum = datum; + struct policy_data *pd = ptr; + void *fp = pd->fp; + __le32 buf[3]; + size_t len; + int rc; + + len = strlen(key); + buf[0] = cpu_to_le32(len); + buf[1] = cpu_to_le32(catdatum->value); + buf[2] = cpu_to_le32(catdatum->isalias); + rc = put_entry(buf, sizeof(u32), 3, fp); + if (rc) + return rc; + + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + + return 0; +} + +static int role_trans_write(struct role_trans *r, void *fp) +{ + struct role_trans *tr; + u32 buf[3]; + size_t nel; + int rc; + + nel = 0; + for (tr = r; tr; tr = tr->next) + nel++; + buf[0] = cpu_to_le32(nel); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + for (tr = r; tr; tr = tr->next) { + buf[0] = cpu_to_le32(tr->role); + buf[1] = cpu_to_le32(tr->type); + buf[2] = cpu_to_le32(tr->new_role); + rc = put_entry(buf, sizeof(u32), 3, fp); + if (rc) + return rc; + } + + return 0; +} + +static int role_allow_write(struct role_allow *r, void *fp) +{ + struct role_allow *ra; + u32 buf[2]; + size_t nel; + int rc; + + nel = 0; + for (ra = r; ra; ra = ra->next) + nel++; + buf[0] = cpu_to_le32(nel); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + for (ra = r; ra; ra = ra->next) { + buf[0] = cpu_to_le32(ra->role); + buf[1] = cpu_to_le32(ra->new_role); + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + } + return 0; +} + +/* + * Write a security context structure + * to a policydb binary representation file. + */ +static int context_write(struct policydb *p, struct context *c, + void *fp) +{ + int rc; + __le32 buf[3]; + + buf[0] = cpu_to_le32(c->user); + buf[1] = cpu_to_le32(c->role); + buf[2] = cpu_to_le32(c->type); + + rc = put_entry(buf, sizeof(u32), 3, fp); + if (rc) + return rc; + + rc = mls_write_range_helper(&c->range, fp); + if (rc) + return rc; + + return 0; +} + +/* + * The following *_write functions are used to + * write the symbol data to a policy database + * binary representation file. + */ + +static int perm_write(void *vkey, void *datum, void *fp) +{ + char *key = vkey; + struct perm_datum *perdatum = datum; + __le32 buf[2]; + size_t len; + int rc; + + len = strlen(key); + buf[0] = cpu_to_le32(len); + buf[1] = cpu_to_le32(perdatum->value); + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + + return 0; +} + +static int common_write(void *vkey, void *datum, void *ptr) +{ + char *key = vkey; + struct common_datum *comdatum = datum; + struct policy_data *pd = ptr; + void *fp = pd->fp; + __le32 buf[4]; + size_t len; + int rc; + + len = strlen(key); + buf[0] = cpu_to_le32(len); + buf[1] = cpu_to_le32(comdatum->value); + buf[2] = cpu_to_le32(comdatum->permissions.nprim); + buf[3] = cpu_to_le32(comdatum->permissions.table->nel); + rc = put_entry(buf, sizeof(u32), 4, fp); + if (rc) + return rc; + + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + + rc = hashtab_map(comdatum->permissions.table, perm_write, fp); + if (rc) + return rc; + + return 0; +} + +static int write_cons_helper(struct policydb *p, struct constraint_node *node, + void *fp) +{ + struct constraint_node *c; + struct constraint_expr *e; + __le32 buf[3]; + u32 nel; + int rc; + + for (c = node; c; c = c->next) { + nel = 0; + for (e = c->expr; e; e = e->next) + nel++; + buf[0] = cpu_to_le32(c->permissions); + buf[1] = cpu_to_le32(nel); + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + for (e = c->expr; e; e = e->next) { + buf[0] = cpu_to_le32(e->expr_type); + buf[1] = cpu_to_le32(e->attr); + buf[2] = cpu_to_le32(e->op); + rc = put_entry(buf, sizeof(u32), 3, fp); + if (rc) + return rc; + + switch (e->expr_type) { + case CEXPR_NAMES: + rc = ebitmap_write(&e->names, fp); + if (rc) + return rc; + break; + default: + break; + } + } + } + + return 0; +} + +static int class_write(void *vkey, void *datum, void *ptr) +{ + char *key = vkey; + struct class_datum *cladatum = datum; + struct policy_data *pd = ptr; + void *fp = pd->fp; + struct policydb *p = pd->p; + struct constraint_node *c; + __le32 buf[6]; + u32 ncons; + size_t len, len2; + int rc; + + len = strlen(key); + if (cladatum->comkey) + len2 = strlen(cladatum->comkey); + else + len2 = 0; + + ncons = 0; + for (c = cladatum->constraints; c; c = c->next) + ncons++; + + buf[0] = cpu_to_le32(len); + buf[1] = cpu_to_le32(len2); + buf[2] = cpu_to_le32(cladatum->value); + buf[3] = cpu_to_le32(cladatum->permissions.nprim); + if (cladatum->permissions.table) + buf[4] = cpu_to_le32(cladatum->permissions.table->nel); + else + buf[4] = 0; + buf[5] = cpu_to_le32(ncons); + rc = put_entry(buf, sizeof(u32), 6, fp); + if (rc) + return rc; + + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + + if (cladatum->comkey) { + rc = put_entry(cladatum->comkey, 1, len2, fp); + if (rc) + return rc; + } + + rc = hashtab_map(cladatum->permissions.table, perm_write, fp); + if (rc) + return rc; + + rc = write_cons_helper(p, cladatum->constraints, fp); + if (rc) + return rc; + + /* write out the validatetrans rule */ + ncons = 0; + for (c = cladatum->validatetrans; c; c = c->next) + ncons++; + + buf[0] = cpu_to_le32(ncons); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + rc = write_cons_helper(p, cladatum->validatetrans, fp); + if (rc) + return rc; + + return 0; +} + +static int role_write(void *vkey, void *datum, void *ptr) +{ + char *key = vkey; + struct role_datum *role = datum; + struct policy_data *pd = ptr; + void *fp = pd->fp; + struct policydb *p = pd->p; + __le32 buf[3]; + size_t items, len; + int rc; + + len = strlen(key); + items = 0; + buf[items++] = cpu_to_le32(len); + buf[items++] = cpu_to_le32(role->value); + if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) + buf[items++] = cpu_to_le32(role->bounds); + + BUG_ON(items > (sizeof(buf)/sizeof(buf[0]))); + + rc = put_entry(buf, sizeof(u32), items, fp); + if (rc) + return rc; + + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + + rc = ebitmap_write(&role->dominates, fp); + if (rc) + return rc; + + rc = ebitmap_write(&role->types, fp); + if (rc) + return rc; + + return 0; +} + +static int type_write(void *vkey, void *datum, void *ptr) +{ + char *key = vkey; + struct type_datum *typdatum = datum; + struct policy_data *pd = ptr; + struct policydb *p = pd->p; + void *fp = pd->fp; + __le32 buf[4]; + int rc; + size_t items, len; + + len = strlen(key); + items = 0; + buf[items++] = cpu_to_le32(len); + buf[items++] = cpu_to_le32(typdatum->value); + if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) { + u32 properties = 0; + + if (typdatum->primary) + properties |= TYPEDATUM_PROPERTY_PRIMARY; + + if (typdatum->attribute) + properties |= TYPEDATUM_PROPERTY_ATTRIBUTE; + + buf[items++] = cpu_to_le32(properties); + buf[items++] = cpu_to_le32(typdatum->bounds); + } else { + buf[items++] = cpu_to_le32(typdatum->primary); + } + BUG_ON(items > (sizeof(buf) / sizeof(buf[0]))); + rc = put_entry(buf, sizeof(u32), items, fp); + if (rc) + return rc; + + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + + return 0; +} + +static int user_write(void *vkey, void *datum, void *ptr) +{ + char *key = vkey; + struct user_datum *usrdatum = datum; + struct policy_data *pd = ptr; + struct policydb *p = pd->p; + void *fp = pd->fp; + __le32 buf[3]; + size_t items, len; + int rc; + + len = strlen(key); + items = 0; + buf[items++] = cpu_to_le32(len); + buf[items++] = cpu_to_le32(usrdatum->value); + if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) + buf[items++] = cpu_to_le32(usrdatum->bounds); + BUG_ON(items > (sizeof(buf) / sizeof(buf[0]))); + rc = put_entry(buf, sizeof(u32), items, fp); + if (rc) + return rc; + + rc = put_entry(key, 1, len, fp); + if (rc) + return rc; + + rc = ebitmap_write(&usrdatum->roles, fp); + if (rc) + return rc; + + rc = mls_write_range_helper(&usrdatum->range, fp); + if (rc) + return rc; + + rc = mls_write_level(&usrdatum->dfltlevel, fp); + if (rc) + return rc; + + return 0; +} + +static int (*write_f[SYM_NUM]) (void *key, void *datum, + void *datap) = +{ + common_write, + class_write, + role_write, + type_write, + user_write, + cond_write_bool, + sens_write, + cat_write, +}; + +static int ocontext_write(struct policydb *p, struct policydb_compat_info *info, + void *fp) +{ + unsigned int i, j, rc; + size_t nel, len; + __le32 buf[3]; + u32 nodebuf[8]; + struct ocontext *c; + for (i = 0; i < info->ocon_num; i++) { + nel = 0; + for (c = p->ocontexts[i]; c; c = c->next) + nel++; + buf[0] = cpu_to_le32(nel); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + for (c = p->ocontexts[i]; c; c = c->next) { + switch (i) { + case OCON_ISID: + buf[0] = cpu_to_le32(c->sid[0]); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + rc = context_write(p, &c->context[0], fp); + if (rc) + return rc; + break; + case OCON_FS: + case OCON_NETIF: + len = strlen(c->u.name); + buf[0] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + rc = put_entry(c->u.name, 1, len, fp); + if (rc) + return rc; + rc = context_write(p, &c->context[0], fp); + if (rc) + return rc; + rc = context_write(p, &c->context[1], fp); + if (rc) + return rc; + break; + case OCON_PORT: + buf[0] = cpu_to_le32(c->u.port.protocol); + buf[1] = cpu_to_le32(c->u.port.low_port); + buf[2] = cpu_to_le32(c->u.port.high_port); + rc = put_entry(buf, sizeof(u32), 3, fp); + if (rc) + return rc; + rc = context_write(p, &c->context[0], fp); + if (rc) + return rc; + break; + case OCON_NODE: + nodebuf[0] = c->u.node.addr; /* network order */ + nodebuf[1] = c->u.node.mask; /* network order */ + rc = put_entry(nodebuf, sizeof(u32), 2, fp); + if (rc) + return rc; + rc = context_write(p, &c->context[0], fp); + if (rc) + return rc; + break; + case OCON_FSUSE: + buf[0] = cpu_to_le32(c->v.behavior); + len = strlen(c->u.name); + buf[1] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + rc = put_entry(c->u.name, 1, len, fp); + if (rc) + return rc; + rc = context_write(p, &c->context[0], fp); + if (rc) + return rc; + break; + case OCON_NODE6: + for (j = 0; j < 4; j++) + nodebuf[j] = c->u.node6.addr[j]; /* network order */ + for (j = 0; j < 4; j++) + nodebuf[j + 4] = c->u.node6.mask[j]; /* network order */ + rc = put_entry(nodebuf, sizeof(u32), 8, fp); + if (rc) + return rc; + rc = context_write(p, &c->context[0], fp); + if (rc) + return rc; + break; + } + } + } + return 0; +} + +static int genfs_write(struct policydb *p, void *fp) +{ + struct genfs *genfs; + struct ocontext *c; + size_t len; + __le32 buf[1]; + int rc; + + len = 0; + for (genfs = p->genfs; genfs; genfs = genfs->next) + len++; + buf[0] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + for (genfs = p->genfs; genfs; genfs = genfs->next) { + len = strlen(genfs->fstype); + buf[0] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + rc = put_entry(genfs->fstype, 1, len, fp); + if (rc) + return rc; + len = 0; + for (c = genfs->head; c; c = c->next) + len++; + buf[0] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + for (c = genfs->head; c; c = c->next) { + len = strlen(c->u.name); + buf[0] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + rc = put_entry(c->u.name, 1, len, fp); + if (rc) + return rc; + buf[0] = cpu_to_le32(c->v.sclass); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + rc = context_write(p, &c->context[0], fp); + if (rc) + return rc; + } + } + return 0; +} + +static int range_count(void *key, void *data, void *ptr) +{ + int *cnt = ptr; + *cnt = *cnt + 1; + + return 0; +} + +static int range_write_helper(void *key, void *data, void *ptr) +{ + __le32 buf[2]; + struct range_trans *rt = key; + struct mls_range *r = data; + struct policy_data *pd = ptr; + void *fp = pd->fp; + struct policydb *p = pd->p; + int rc; + + buf[0] = cpu_to_le32(rt->source_type); + buf[1] = cpu_to_le32(rt->target_type); + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + if (p->policyvers >= POLICYDB_VERSION_RANGETRANS) { + buf[0] = cpu_to_le32(rt->target_class); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + } + rc = mls_write_range_helper(r, fp); + if (rc) + return rc; + + return 0; +} + +static int range_write(struct policydb *p, void *fp) +{ + size_t nel; + __le32 buf[1]; + int rc; + struct policy_data pd; + + pd.p = p; + pd.fp = fp; + + /* count the number of entries in the hashtab */ + nel = 0; + rc = hashtab_map(p->range_tr, range_count, &nel); + if (rc) + return rc; + + buf[0] = cpu_to_le32(nel); + rc = put_entry(buf, sizeof(u32), 1, fp); + if (rc) + return rc; + + /* actually write all of the entries */ + rc = hashtab_map(p->range_tr, range_write_helper, &pd); + if (rc) + return rc; + + return 0; +} + +/* + * Write the configuration data in a policy database + * structure to a policy database binary representation + * file. + */ +int policydb_write(struct policydb *p, void *fp) +{ + unsigned int i, num_syms; + int rc; + __le32 buf[4]; + u32 config; + size_t len; + struct policydb_compat_info *info; + + /* + * refuse to write policy older than compressed avtab + * to simplify the writer. There are other tests dropped + * since we assume this throughout the writer code. Be + * careful if you ever try to remove this restriction + */ + if (p->policyvers < POLICYDB_VERSION_AVTAB) { + printk(KERN_ERR "SELinux: refusing to write policy version %d." + " Because it is less than version %d\n", p->policyvers, + POLICYDB_VERSION_AVTAB); + return -EINVAL; + } + + config = 0; + if (p->mls_enabled) + config |= POLICYDB_CONFIG_MLS; + + if (p->reject_unknown) + config |= REJECT_UNKNOWN; + if (p->allow_unknown) + config |= ALLOW_UNKNOWN; + + /* Write the magic number and string identifiers. */ + buf[0] = cpu_to_le32(POLICYDB_MAGIC); + len = strlen(POLICYDB_STRING); + buf[1] = cpu_to_le32(len); + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + rc = put_entry(POLICYDB_STRING, 1, len, fp); + if (rc) + return rc; + + /* Write the version, config, and table sizes. */ + info = policydb_lookup_compat(p->policyvers); + if (!info) { + printk(KERN_ERR "SELinux: compatibility lookup failed for policy " + "version %d", p->policyvers); + return rc; + } + + buf[0] = cpu_to_le32(p->policyvers); + buf[1] = cpu_to_le32(config); + buf[2] = cpu_to_le32(info->sym_num); + buf[3] = cpu_to_le32(info->ocon_num); + + rc = put_entry(buf, sizeof(u32), 4, fp); + if (rc) + return rc; + + if (p->policyvers >= POLICYDB_VERSION_POLCAP) { + rc = ebitmap_write(&p->policycaps, fp); + if (rc) + return rc; + } + + if (p->policyvers >= POLICYDB_VERSION_PERMISSIVE) { + rc = ebitmap_write(&p->permissive_map, fp); + if (rc) + return rc; + } + + num_syms = info->sym_num; + for (i = 0; i < num_syms; i++) { + struct policy_data pd; + + pd.fp = fp; + pd.p = p; + + buf[0] = cpu_to_le32(p->symtab[i].nprim); + buf[1] = cpu_to_le32(p->symtab[i].table->nel); + + rc = put_entry(buf, sizeof(u32), 2, fp); + if (rc) + return rc; + rc = hashtab_map(p->symtab[i].table, write_f[i], &pd); + if (rc) + return rc; + } + + rc = avtab_write(p, &p->te_avtab, fp); + if (rc) + return rc; + + rc = cond_write_list(p, p->cond_list, fp); + if (rc) + return rc; + + rc = role_trans_write(p->role_tr, fp); + if (rc) + return rc; + + rc = role_allow_write(p->role_allow, fp); + if (rc) + return rc; + + rc = ocontext_write(p, info, fp); + if (rc) + return rc; + + rc = genfs_write(p, fp); + if (rc) + return rc; + + rc = range_write(p, fp); + if (rc) + return rc; + + for (i = 0; i < p->p_types.nprim; i++) { + struct ebitmap *e = flex_array_get(p->type_attr_map_array, i); + + BUG_ON(!e); + rc = ebitmap_write(e, fp); + if (rc) + return rc; + } + + return 0; +} diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h index 310e94442cb8..95d3d7de361e 100644 --- a/security/selinux/ss/policydb.h +++ b/security/selinux/ss/policydb.h @@ -254,6 +254,9 @@ struct policydb { struct ebitmap permissive_map; + /* length of this policy when it was loaded */ + size_t len; + unsigned int policyvers; unsigned int reject_unknown : 1; @@ -270,6 +273,7 @@ extern int policydb_class_isvalid(struct policydb *p, unsigned int class); extern int policydb_type_isvalid(struct policydb *p, unsigned int type); extern int policydb_role_isvalid(struct policydb *p, unsigned int role); extern int policydb_read(struct policydb *p, void *fp); +extern int policydb_write(struct policydb *p, void *fp); #define PERM_SYMTAB_SIZE 32 @@ -290,6 +294,11 @@ struct policy_file { size_t len; }; +struct policy_data { + struct policydb *p; + void *fp; +}; + static inline int next_entry(void *buf, struct policy_file *fp, size_t bytes) { if (bytes > fp->len) @@ -301,6 +310,17 @@ static inline int next_entry(void *buf, struct policy_file *fp, size_t bytes) return 0; } +static inline int put_entry(void *buf, size_t bytes, int num, struct policy_file *fp) +{ + size_t len = bytes * num; + + memcpy(fp->data, buf, len); + fp->data += len; + fp->len -= len; + + return 0; +} + extern u16 string_to_security_class(struct policydb *p, const char *name); extern u32 string_to_av_perm(struct policydb *p, u16 tclass, const char *name); diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 9ea2feca3cd4..223c1ff6ef23 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -51,6 +51,7 @@ #include <linux/mutex.h> #include <linux/selinux.h> #include <linux/flex_array.h> +#include <linux/vmalloc.h> #include <net/netlabel.h> #include "flask.h" @@ -991,7 +992,8 @@ static int context_struct_to_string(struct context *context, char **scontext, u3 { char *scontextp; - *scontext = NULL; + if (scontext) + *scontext = NULL; *scontext_len = 0; if (context->len) { @@ -1008,6 +1010,9 @@ static int context_struct_to_string(struct context *context, char **scontext, u3 *scontext_len += strlen(policydb.p_type_val_to_name[context->type - 1]) + 1; *scontext_len += mls_compute_context_len(context); + if (!scontext) + return 0; + /* Allocate space for the context; caller must free this space. */ scontextp = kmalloc(*scontext_len, GFP_ATOMIC); if (!scontextp) @@ -1047,7 +1052,8 @@ static int security_sid_to_context_core(u32 sid, char **scontext, struct context *context; int rc = 0; - *scontext = NULL; + if (scontext) + *scontext = NULL; *scontext_len = 0; if (!ss_initialized) { @@ -1055,6 +1061,8 @@ static int security_sid_to_context_core(u32 sid, char **scontext, char *scontextp; *scontext_len = strlen(initial_sid_to_string[sid]) + 1; + if (!scontext) + goto out; scontextp = kmalloc(*scontext_len, GFP_ATOMIC); if (!scontextp) { rc = -ENOMEM; @@ -1769,6 +1777,7 @@ int security_load_policy(void *data, size_t len) return rc; } + policydb.len = len; rc = selinux_set_mapping(&policydb, secclass_map, ¤t_mapping, ¤t_mapping_size); @@ -1791,6 +1800,7 @@ int security_load_policy(void *data, size_t len) selinux_complete_init(); avc_ss_reset(seqno); selnl_notify_policyload(seqno); + selinux_status_update_policyload(seqno); selinux_netlbl_cache_invalidate(); selinux_xfrm_notify_policyload(); return 0; @@ -1804,6 +1814,7 @@ int security_load_policy(void *data, size_t len) if (rc) return rc; + newpolicydb.len = len; /* If switching between different policy types, log MLS status */ if (policydb.mls_enabled && !newpolicydb.mls_enabled) printk(KERN_INFO "SELinux: Disabling MLS support...\n"); @@ -1870,6 +1881,7 @@ int security_load_policy(void *data, size_t len) avc_ss_reset(seqno); selnl_notify_policyload(seqno); + selinux_status_update_policyload(seqno); selinux_netlbl_cache_invalidate(); selinux_xfrm_notify_policyload(); @@ -1883,6 +1895,17 @@ err: } +size_t security_policydb_len(void) +{ + size_t len; + + read_lock(&policy_rwlock); + len = policydb.len; + read_unlock(&policy_rwlock); + + return len; +} + /** * security_port_sid - Obtain the SID for a port. * @protocol: protocol number @@ -2374,6 +2397,7 @@ out: if (!rc) { avc_ss_reset(seqno); selnl_notify_policyload(seqno); + selinux_status_update_policyload(seqno); selinux_xfrm_notify_policyload(); } return rc; @@ -3129,3 +3153,38 @@ netlbl_sid_to_secattr_failure: return rc; } #endif /* CONFIG_NETLABEL */ + +/** + * security_read_policy - read the policy. + * @data: binary policy data + * @len: length of data in bytes + * + */ +int security_read_policy(void **data, ssize_t *len) +{ + int rc; + struct policy_file fp; + + if (!ss_initialized) + return -EINVAL; + + *len = security_policydb_len(); + + *data = vmalloc_user(*len); + if (!*data) + return -ENOMEM; + + fp.data = *data; + fp.len = *len; + + read_lock(&policy_rwlock); + rc = policydb_write(&policydb, &fp); + read_unlock(&policy_rwlock); + + if (rc) + return rc; + + *len = (unsigned long)fp.data - (unsigned long)*data; + return 0; + +} diff --git a/security/selinux/ss/status.c b/security/selinux/ss/status.c new file mode 100644 index 000000000000..d982365f9d1a --- /dev/null +++ b/security/selinux/ss/status.c @@ -0,0 +1,126 @@ +/* + * mmap based event notifications for SELinux + * + * Author: KaiGai Kohei <kaigai@ak.jp.nec.com> + * + * Copyright (C) 2010 NEC corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + */ +#include <linux/kernel.h> +#include <linux/gfp.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include "avc.h" +#include "services.h" + +/* + * The selinux_status_page shall be exposed to userspace applications + * using mmap interface on /selinux/status. + * It enables to notify applications a few events that will cause reset + * of userspace access vector without context switching. + * + * The selinux_kernel_status structure on the head of status page is + * protected from concurrent accesses using seqlock logic, so userspace + * application should reference the status page according to the seqlock + * logic. + * + * Typically, application checks status->sequence at the head of access + * control routine. If it is odd-number, kernel is updating the status, + * so please wait for a moment. If it is changed from the last sequence + * number, it means something happen, so application will reset userspace + * avc, if needed. + * In most cases, application shall confirm the kernel status is not + * changed without any system call invocations. + */ +static struct page *selinux_status_page; +static DEFINE_MUTEX(selinux_status_lock); + +/* + * selinux_kernel_status_page + * + * It returns a reference to selinux_status_page. If the status page is + * not allocated yet, it also tries to allocate it at the first time. + */ +struct page *selinux_kernel_status_page(void) +{ + struct selinux_kernel_status *status; + struct page *result = NULL; + + mutex_lock(&selinux_status_lock); + if (!selinux_status_page) { + selinux_status_page = alloc_page(GFP_KERNEL|__GFP_ZERO); + + if (selinux_status_page) { + status = page_address(selinux_status_page); + + status->version = SELINUX_KERNEL_STATUS_VERSION; + status->sequence = 0; + status->enforcing = selinux_enforcing; + /* + * NOTE: the next policyload event shall set + * a positive value on the status->policyload, + * although it may not be 1, but never zero. + * So, application can know it was updated. + */ + status->policyload = 0; + status->deny_unknown = !security_get_allow_unknown(); + } + } + result = selinux_status_page; + mutex_unlock(&selinux_status_lock); + + return result; +} + +/* + * selinux_status_update_setenforce + * + * It updates status of the current enforcing/permissive mode. + */ +void selinux_status_update_setenforce(int enforcing) +{ + struct selinux_kernel_status *status; + + mutex_lock(&selinux_status_lock); + if (selinux_status_page) { + status = page_address(selinux_status_page); + + status->sequence++; + smp_wmb(); + + status->enforcing = enforcing; + + smp_wmb(); + status->sequence++; + } + mutex_unlock(&selinux_status_lock); +} + +/* + * selinux_status_update_policyload + * + * It updates status of the times of policy reloaded, and current + * setting of deny_unknown. + */ +void selinux_status_update_policyload(int seqno) +{ + struct selinux_kernel_status *status; + + mutex_lock(&selinux_status_lock); + if (selinux_status_page) { + status = page_address(selinux_status_page); + + status->sequence++; + smp_wmb(); + + status->policyload = seqno; + status->deny_unknown = !security_get_allow_unknown(); + + smp_wmb(); + status->sequence++; + } + mutex_unlock(&selinux_status_lock); +} diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index c448d57ae2b7..bc39f4067af6 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1281,12 +1281,11 @@ static int smack_task_getioprio(struct task_struct *p) * * Return 0 if read access is permitted */ -static int smack_task_setscheduler(struct task_struct *p, int policy, - struct sched_param *lp) +static int smack_task_setscheduler(struct task_struct *p) { int rc; - rc = cap_task_setscheduler(p, policy, lp); + rc = cap_task_setscheduler(p); if (rc == 0) rc = smk_curacc_on_task(p, MAY_WRITE); return rc; @@ -3005,7 +3004,8 @@ static int smack_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { char *sp = smack_from_secid(secid); - *secdata = sp; + if (secdata) + *secdata = sp; *seclen = strlen(sp); return 0; } diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index c668b447c725..7556315c1978 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -768,8 +768,10 @@ static bool tomoyo_select_one(struct tomoyo_io_buffer *head, const char *data) return true; /* Do nothing if open(O_WRONLY). */ memset(&head->r, 0, sizeof(head->r)); head->r.print_this_domain_only = true; - head->r.eof = !domain; - head->r.domain = &domain->list; + if (domain) + head->r.domain = &domain->list; + else + head->r.eof = 1; tomoyo_io_printf(head, "# select %s\n", data); if (domain && domain->is_deleted) tomoyo_io_printf(head, "# This is a deleted domain.\n"); @@ -2051,13 +2053,22 @@ void tomoyo_check_profile(void) const u8 profile = domain->profile; if (tomoyo_profile_ptr[profile]) continue; + printk(KERN_ERR "You need to define profile %u before using it.\n", + profile); + printk(KERN_ERR "Please see http://tomoyo.sourceforge.jp/2.3/ " + "for more information.\n"); panic("Profile %u (used by '%s') not defined.\n", profile, domain->domainname->name); } tomoyo_read_unlock(idx); - if (tomoyo_profile_version != 20090903) + if (tomoyo_profile_version != 20090903) { + printk(KERN_ERR "You need to install userland programs for " + "TOMOYO 2.3 and initialize policy configuration.\n"); + printk(KERN_ERR "Please see http://tomoyo.sourceforge.jp/2.3/ " + "for more information.\n"); panic("Profile version %u is not supported.\n", tomoyo_profile_version); + } printk(KERN_INFO "TOMOYO: 2.3.0\n"); printk(KERN_INFO "Mandatory Access Control activated.\n"); } diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c index a7868ad4d530..cbbed0db9e56 100644 --- a/sound/core/rawmidi.c +++ b/sound/core/rawmidi.c @@ -535,13 +535,15 @@ static int snd_rawmidi_release(struct inode *inode, struct file *file) { struct snd_rawmidi_file *rfile; struct snd_rawmidi *rmidi; + struct module *module; rfile = file->private_data; rmidi = rfile->rmidi; rawmidi_release_priv(rfile); kfree(rfile); + module = rmidi->card->module; snd_card_file_remove(rmidi->card, file); - module_put(rmidi->card->module); + module_put(module); return 0; } diff --git a/sound/oss/soundcard.c b/sound/oss/soundcard.c index 92aa762ffb7e..07f803e6d203 100644 --- a/sound/oss/soundcard.c +++ b/sound/oss/soundcard.c @@ -391,11 +391,11 @@ static long sound_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case SND_DEV_DSP: case SND_DEV_DSP16: case SND_DEV_AUDIO: - return audio_ioctl(dev, file, cmd, p); + ret = audio_ioctl(dev, file, cmd, p); break; case SND_DEV_MIDIN: - return MIDIbuf_ioctl(dev, file, cmd, p); + ret = MIDIbuf_ioctl(dev, file, cmd, p); break; } diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c index 95148e58026c..c16c5ba0fda0 100644 --- a/sound/pci/hda/patch_sigmatel.c +++ b/sound/pci/hda/patch_sigmatel.c @@ -1747,6 +1747,8 @@ static struct snd_pci_quirk stac92hd71bxx_cfg_tbl[] = { "HP dv6", STAC_HP_DV5), SND_PCI_QUIRK(PCI_VENDOR_ID_HP, 0x3061, "HP dv6", STAC_HP_DV5), /* HP dv6-1110ax */ + SND_PCI_QUIRK(PCI_VENDOR_ID_HP, 0x363e, + "HP DV6", STAC_HP_DV5), SND_PCI_QUIRK_MASK(PCI_VENDOR_ID_HP, 0xfff0, 0x7010, "HP", STAC_HP_DV5), SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x0233, diff --git a/tools/perf/perf.h b/tools/perf/perf.h index ef7aa0a0c526..95aaf565c704 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -73,6 +73,18 @@ void get_term_dimensions(struct winsize *ws); #define cpu_relax() asm volatile("":::"memory") #endif +#ifdef __mips__ +#include "../../arch/mips/include/asm/unistd.h" +#define rmb() asm volatile( \ + ".set mips2\n\t" \ + "sync\n\t" \ + ".set mips0" \ + : /* no output */ \ + : /* no input */ \ + : "memory") +#define cpu_relax() asm volatile("" ::: "memory") +#endif + #include <time.h> #include <unistd.h> #include <sys/types.h> |