author		Linus Torvalds <torvalds@linux-foundation.org>	2018-04-11 10:51:26 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-04-11 10:51:26 -0700
commit		8837c70d531a1788f975c366c254a5cb973a5291 (patch)
tree		f7a719d01090efb3bc534f5b0d7f13ec87eecadb
parent		b284d4d5a6785f8cd07eda2646a95782373cd01e (diff)
parent		b93b016313b3ba8003c3b8bb71f569af91f19fc7 (diff)

Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
- almost all of the rest of MM
- kasan updates
- lots of procfs work
- misc things
- lib/ updates
- checkpatch
- rapidio
- ipc/shm updates
- the start of willy's XArray conversion
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (140 commits)
page cache: use xa_lock
xarray: add the xa_lock to the radix_tree_root
fscache: use appropriate radix tree accessors
export __set_page_dirty
unicore32: turn flush_dcache_mmap_lock into a no-op
arm64: turn flush_dcache_mmap_lock into a no-op
mac80211_hwsim: use DEFINE_IDA
radix tree: use GFP_ZONEMASK bits of gfp_t for flags
linux/const.h: refactor _BITUL and _BITULL a bit
linux/const.h: move UL() macro to include/linux/const.h
linux/const.h: prefix include guard of uapi/linux/const.h with _UAPI
xen, mm: allow deferred page initialization for xen pv domains
elf: enforce MAP_FIXED on overlaying elf segments
fs, elf: drop MAP_FIXED usage from elf_map
mm: introduce MAP_FIXED_NOREPLACE
MAINTAINERS: update bouncing aacraid@adaptec.com addresses
fs/dcache.c: add cond_resched() in shrink_dentry_list()
include/linux/kfifo.h: fix comment
ipc/shm.c: shm_split(): remove unneeded test for NULL shm_file_data.vm_ops
kernel/sysctl.c: add kdoc comments to do_proc_do{u}intvec_minmax_conv_param
...
182 files changed, 4159 insertions, 2095 deletions
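
(Editor's note, not part of the merge: a minimal userspace sketch of what the "mm: introduce MAP_FIXED_NOREPLACE" patch listed above provides, assuming uapi headers from this kernel so the flag is defined — like MAP_FIXED, but an existing mapping is never clobbered and the call fails with EEXIST instead.)

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 4096;
		/* First mapping: let the kernel pick the address. */
		void *a = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (a == MAP_FAILED)
			return 1;

		/* Ask for the same range again, refusing to replace it. */
		if (mmap(a, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE,
			 -1, 0) == MAP_FAILED)
			printf("second mmap refused: %s\n", strerror(errno)); /* EEXIST */

		munmap(a, len);
		return 0;
	}
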
diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000000..faffc0d5af4e --- /dev/null +++ b/.clang-format @@ -0,0 +1,428 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# clang-format configuration file. Intended for clang-format >= 4. +# +# For more information, see: +# +# Documentation/process/clang-format.rst +# https://clang.llvm.org/docs/ClangFormat.html +# https://clang.llvm.org/docs/ClangFormatStyleOptions.html +# +--- +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +#AlignEscapedNewlines: Left # Unknown to clang-format-4.0 +AlignOperands: true +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: false +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + #AfterExternBlock: false # Unknown to clang-format-5.0 + BeforeCatch: false + BeforeElse: false + IndentBraces: false + #SplitEmptyFunction: true # Unknown to clang-format-4.0 + #SplitEmptyRecord: true # Unknown to clang-format-4.0 + #SplitEmptyNamespace: true # Unknown to clang-format-4.0 +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +#BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0 +BreakBeforeTernaryOperators: false +BreakConstructorInitializersBeforeComma: false +#BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0 +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +#CompactNamespaces: false # Unknown to clang-format-4.0 +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 8 +ContinuationIndentWidth: 8 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +#FixNamespaceComments: false # Unknown to clang-format-4.0 + +# Taken from: +# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \ +# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \ +# | sort | uniq +ForEachMacros: + - 'apei_estatus_for_each_section' + - 'ata_for_each_dev' + - 'ata_for_each_link' + - 'ax25_for_each' + - 'ax25_uid_for_each' + - 'bio_for_each_integrity_vec' + - '__bio_for_each_segment' + - 'bio_for_each_segment' + - 'bio_for_each_segment_all' + - 'bio_list_for_each' + - 'bip_for_each_vec' + - 'blkg_for_each_descendant_post' + - 'blkg_for_each_descendant_pre' + - 'blk_queue_for_each_rl' + - 'bond_for_each_slave' + - 'bond_for_each_slave_rcu' + - 'btree_for_each_safe128' + - 'btree_for_each_safe32' + - 'btree_for_each_safe64' + - 'btree_for_each_safel' + - 'card_for_each_dev' + - 'cgroup_taskset_for_each' + - 'cgroup_taskset_for_each_leader' + - 'cpufreq_for_each_entry' + - 'cpufreq_for_each_entry_idx' + - 'cpufreq_for_each_valid_entry' + - 'cpufreq_for_each_valid_entry_idx' + - 'css_for_each_child' + - 'css_for_each_descendant_post' + - 'css_for_each_descendant_pre' + - 'device_for_each_child_node' + - 
'drm_atomic_crtc_for_each_plane' + - 'drm_atomic_crtc_state_for_each_plane' + - 'drm_atomic_crtc_state_for_each_plane_state' + - 'drm_for_each_connector_iter' + - 'drm_for_each_crtc' + - 'drm_for_each_encoder' + - 'drm_for_each_encoder_mask' + - 'drm_for_each_fb' + - 'drm_for_each_legacy_plane' + - 'drm_for_each_plane' + - 'drm_for_each_plane_mask' + - 'drm_mm_for_each_hole' + - 'drm_mm_for_each_node' + - 'drm_mm_for_each_node_in_range' + - 'drm_mm_for_each_node_safe' + - 'for_each_active_drhd_unit' + - 'for_each_active_iommu' + - 'for_each_available_child_of_node' + - 'for_each_bio' + - 'for_each_board_func_rsrc' + - 'for_each_bvec' + - 'for_each_child_of_node' + - 'for_each_clear_bit' + - 'for_each_clear_bit_from' + - 'for_each_cmsghdr' + - 'for_each_compatible_node' + - 'for_each_console' + - 'for_each_cpu' + - 'for_each_cpu_and' + - 'for_each_cpu_not' + - 'for_each_cpu_wrap' + - 'for_each_dev_addr' + - 'for_each_dma_cap_mask' + - 'for_each_drhd_unit' + - 'for_each_dss_dev' + - 'for_each_efi_memory_desc' + - 'for_each_efi_memory_desc_in_map' + - 'for_each_endpoint_of_node' + - 'for_each_evictable_lru' + - 'for_each_fib6_node_rt_rcu' + - 'for_each_fib6_walker_rt' + - 'for_each_free_mem_range' + - 'for_each_free_mem_range_reverse' + - 'for_each_func_rsrc' + - 'for_each_hstate' + - 'for_each_if' + - 'for_each_iommu' + - 'for_each_ip_tunnel_rcu' + - 'for_each_irq_nr' + - 'for_each_lru' + - 'for_each_matching_node' + - 'for_each_matching_node_and_match' + - 'for_each_memblock' + - 'for_each_memblock_type' + - 'for_each_memcg_cache_index' + - 'for_each_mem_pfn_range' + - 'for_each_mem_range' + - 'for_each_mem_range_rev' + - 'for_each_migratetype_order' + - 'for_each_msi_entry' + - 'for_each_net' + - 'for_each_netdev' + - 'for_each_netdev_continue' + - 'for_each_netdev_continue_rcu' + - 'for_each_netdev_feature' + - 'for_each_netdev_in_bond_rcu' + - 'for_each_netdev_rcu' + - 'for_each_netdev_reverse' + - 'for_each_netdev_safe' + - 'for_each_net_rcu' + - 'for_each_new_connector_in_state' + - 'for_each_new_crtc_in_state' + - 'for_each_new_plane_in_state' + - 'for_each_new_private_obj_in_state' + - 'for_each_node' + - 'for_each_node_by_name' + - 'for_each_node_by_type' + - 'for_each_node_mask' + - 'for_each_node_state' + - 'for_each_node_with_cpus' + - 'for_each_node_with_property' + - 'for_each_of_allnodes' + - 'for_each_of_allnodes_from' + - 'for_each_of_pci_range' + - 'for_each_old_connector_in_state' + - 'for_each_old_crtc_in_state' + - 'for_each_oldnew_connector_in_state' + - 'for_each_oldnew_crtc_in_state' + - 'for_each_oldnew_plane_in_state' + - 'for_each_oldnew_private_obj_in_state' + - 'for_each_old_plane_in_state' + - 'for_each_old_private_obj_in_state' + - 'for_each_online_cpu' + - 'for_each_online_node' + - 'for_each_online_pgdat' + - 'for_each_pci_bridge' + - 'for_each_pci_dev' + - 'for_each_pci_msi_entry' + - 'for_each_populated_zone' + - 'for_each_possible_cpu' + - 'for_each_present_cpu' + - 'for_each_prime_number' + - 'for_each_prime_number_from' + - 'for_each_process' + - 'for_each_process_thread' + - 'for_each_property_of_node' + - 'for_each_reserved_mem_region' + - 'for_each_resv_unavail_range' + - 'for_each_rtdcom' + - 'for_each_rtdcom_safe' + - 'for_each_set_bit' + - 'for_each_set_bit_from' + - 'for_each_sg' + - 'for_each_sg_page' + - '__for_each_thread' + - 'for_each_thread' + - 'for_each_zone' + - 'for_each_zone_zonelist' + - 'for_each_zone_zonelist_nodemask' + - 'fwnode_for_each_available_child_node' + - 'fwnode_for_each_child_node' + - 'fwnode_graph_for_each_endpoint' + - 
'gadget_for_each_ep' + - 'hash_for_each' + - 'hash_for_each_possible' + - 'hash_for_each_possible_rcu' + - 'hash_for_each_possible_rcu_notrace' + - 'hash_for_each_possible_safe' + - 'hash_for_each_rcu' + - 'hash_for_each_safe' + - 'hctx_for_each_ctx' + - 'hlist_bl_for_each_entry' + - 'hlist_bl_for_each_entry_rcu' + - 'hlist_bl_for_each_entry_safe' + - 'hlist_for_each' + - 'hlist_for_each_entry' + - 'hlist_for_each_entry_continue' + - 'hlist_for_each_entry_continue_rcu' + - 'hlist_for_each_entry_continue_rcu_bh' + - 'hlist_for_each_entry_from' + - 'hlist_for_each_entry_from_rcu' + - 'hlist_for_each_entry_rcu' + - 'hlist_for_each_entry_rcu_bh' + - 'hlist_for_each_entry_rcu_notrace' + - 'hlist_for_each_entry_safe' + - '__hlist_for_each_rcu' + - 'hlist_for_each_safe' + - 'hlist_nulls_for_each_entry' + - 'hlist_nulls_for_each_entry_from' + - 'hlist_nulls_for_each_entry_rcu' + - 'hlist_nulls_for_each_entry_safe' + - 'ide_host_for_each_port' + - 'ide_port_for_each_dev' + - 'ide_port_for_each_present_dev' + - 'idr_for_each_entry' + - 'idr_for_each_entry_continue' + - 'idr_for_each_entry_ul' + - 'inet_bind_bucket_for_each' + - 'inet_lhash2_for_each_icsk_rcu' + - 'iov_for_each' + - 'key_for_each' + - 'key_for_each_safe' + - 'klp_for_each_func' + - 'klp_for_each_object' + - 'kvm_for_each_memslot' + - 'kvm_for_each_vcpu' + - 'list_for_each' + - 'list_for_each_entry' + - 'list_for_each_entry_continue' + - 'list_for_each_entry_continue_rcu' + - 'list_for_each_entry_continue_reverse' + - 'list_for_each_entry_from' + - 'list_for_each_entry_from_reverse' + - 'list_for_each_entry_lockless' + - 'list_for_each_entry_rcu' + - 'list_for_each_entry_reverse' + - 'list_for_each_entry_safe' + - 'list_for_each_entry_safe_continue' + - 'list_for_each_entry_safe_from' + - 'list_for_each_entry_safe_reverse' + - 'list_for_each_prev' + - 'list_for_each_prev_safe' + - 'list_for_each_safe' + - 'llist_for_each' + - 'llist_for_each_entry' + - 'llist_for_each_entry_safe' + - 'llist_for_each_safe' + - 'media_device_for_each_entity' + - 'media_device_for_each_intf' + - 'media_device_for_each_link' + - 'media_device_for_each_pad' + - 'netdev_for_each_lower_dev' + - 'netdev_for_each_lower_private' + - 'netdev_for_each_lower_private_rcu' + - 'netdev_for_each_mc_addr' + - 'netdev_for_each_uc_addr' + - 'netdev_for_each_upper_dev_rcu' + - 'netdev_hw_addr_list_for_each' + - 'nft_rule_for_each_expr' + - 'nla_for_each_attr' + - 'nla_for_each_nested' + - 'nlmsg_for_each_attr' + - 'nlmsg_for_each_msg' + - 'nr_neigh_for_each' + - 'nr_neigh_for_each_safe' + - 'nr_node_for_each' + - 'nr_node_for_each_safe' + - 'of_for_each_phandle' + - 'of_property_for_each_string' + - 'of_property_for_each_u32' + - 'pci_bus_for_each_resource' + - 'ping_portaddr_for_each_entry' + - 'plist_for_each' + - 'plist_for_each_continue' + - 'plist_for_each_entry' + - 'plist_for_each_entry_continue' + - 'plist_for_each_entry_safe' + - 'plist_for_each_safe' + - 'pnp_for_each_card' + - 'pnp_for_each_dev' + - 'protocol_for_each_card' + - 'protocol_for_each_dev' + - 'queue_for_each_hw_ctx' + - 'radix_tree_for_each_contig' + - 'radix_tree_for_each_slot' + - 'radix_tree_for_each_tagged' + - 'rbtree_postorder_for_each_entry_safe' + - 'resource_list_for_each_entry' + - 'resource_list_for_each_entry_safe' + - 'rhl_for_each_entry_rcu' + - 'rhl_for_each_rcu' + - 'rht_for_each' + - 'rht_for_each_continue' + - 'rht_for_each_entry' + - 'rht_for_each_entry_continue' + - 'rht_for_each_entry_rcu' + - 'rht_for_each_entry_rcu_continue' + - 'rht_for_each_entry_safe' + - 
'rht_for_each_rcu' + - 'rht_for_each_rcu_continue' + - '__rq_for_each_bio' + - 'rq_for_each_segment' + - 'scsi_for_each_prot_sg' + - 'scsi_for_each_sg' + - 'sctp_for_each_hentry' + - 'sctp_skb_for_each' + - 'shdma_for_each_chan' + - '__shost_for_each_device' + - 'shost_for_each_device' + - 'sk_for_each' + - 'sk_for_each_bound' + - 'sk_for_each_entry_offset_rcu' + - 'sk_for_each_from' + - 'sk_for_each_rcu' + - 'sk_for_each_safe' + - 'sk_nulls_for_each' + - 'sk_nulls_for_each_from' + - 'sk_nulls_for_each_rcu' + - 'snd_pcm_group_for_each_entry' + - 'snd_soc_dapm_widget_for_each_path' + - 'snd_soc_dapm_widget_for_each_path_safe' + - 'snd_soc_dapm_widget_for_each_sink_path' + - 'snd_soc_dapm_widget_for_each_source_path' + - 'tb_property_for_each' + - 'udp_portaddr_for_each_entry' + - 'udp_portaddr_for_each_entry_rcu' + - 'usb_hub_for_each_child' + - 'v4l2_device_for_each_subdev' + - 'v4l2_m2m_for_each_dst_buf' + - 'v4l2_m2m_for_each_dst_buf_safe' + - 'v4l2_m2m_for_each_src_buf' + - 'v4l2_m2m_for_each_src_buf_safe' + - 'zorro_for_each_dev' + +#IncludeBlocks: Preserve # Unknown to clang-format-5.0 +IncludeCategories: + - Regex: '.*' + Priority: 1 +IncludeIsMainRegex: '(Test)?$' +IndentCaseLabels: false +#IndentPPDirectives: None # Unknown to clang-format-5.0 +IndentWidth: 8 +IndentWrappedFunctionNames: true +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: Inner +#ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0 +ObjCBlockIndentWidth: 8 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true + +# Taken from git's rules +#PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0 +PenaltyBreakBeforeFirstCallParameter: 30 +PenaltyBreakComment: 10 +PenaltyBreakFirstLessLess: 0 +PenaltyBreakString: 10 +PenaltyExcessCharacter: 100 +PenaltyReturnTypeOnItsOwnLine: 60 + +PointerAlignment: Right +ReflowComments: false +SortIncludes: false +#SortUsingDeclarations: false # Unknown to clang-format-4.0 +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +#SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0 +#SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0 +SpaceBeforeParens: ControlStatements +#SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0 +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp03 +TabWidth: 8 +UseTab: Always +... diff --git a/.gitignore b/.gitignore index 85bcc2696442..a1dfd2acd9c3 100644 --- a/.gitignore +++ b/.gitignore @@ -81,6 +81,7 @@ modules.builtin !.gitignore !.mailmap !.cocciconfig +!.clang-format # # Generated include files diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt index a4af2e124e24..3682e99234c2 100644 --- a/Documentation/cgroup-v1/memory.txt +++ b/Documentation/cgroup-v1/memory.txt @@ -262,7 +262,7 @@ When oom event notifier is registered, event will be delivered. 2.6 Locking lock_page_cgroup()/unlock_page_cgroup() should not be called under - mapping->tree_lock. + the i_pages lock. Other lock order is following: PG_locked. 
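
(Editor's note, illustration only: a toy function laid out roughly the way the .clang-format file added above is tuned to format kernel code — 8-column tabs, 80-column lines, the opening brace of a function on its own line, and the pointer '*' attached to the name.)

	/* Toy function formatted per the kernel clang-format configuration. */
	static int example_sum(const int *vals, int n)
	{
		int i, sum = 0;

		for (i = 0; i < n; i++)
			sum += vals[i];

		return sum;
	}
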
diff --git a/Documentation/process/4.Coding.rst b/Documentation/process/4.Coding.rst index 26b106071364..eb4b185d168c 100644 --- a/Documentation/process/4.Coding.rst +++ b/Documentation/process/4.Coding.rst @@ -58,6 +58,14 @@ can never be transgressed. If there is a good reason to go against the style (a line which becomes far less readable if split to fit within the 80-column limit, for example), just do it. +Note that you can also use the ``clang-format`` tool to help you with +these rules, to quickly re-format parts of your code automatically, +and to review full files in order to spot coding style mistakes, +typos and possible improvements. It is also handy for sorting ``#includes``, +for aligning variables/macros, for reflowing text and other similar tasks. +See the file :ref:`Documentation/process/clang-format.rst <clangformat>` +for more details. + Abstraction layers ****************** diff --git a/Documentation/process/clang-format.rst b/Documentation/process/clang-format.rst new file mode 100644 index 000000000000..6710c0707721 --- /dev/null +++ b/Documentation/process/clang-format.rst @@ -0,0 +1,184 @@ +.. _clangformat: + +clang-format +============ + +``clang-format`` is a tool to format C/C++/... code according to +a set of rules and heuristics. Like most tools, it is not perfect +nor covers every single case, but it is good enough to be helpful. + +``clang-format`` can be used for several purposes: + + - Quickly reformat a block of code to the kernel style. Specially useful + when moving code around and aligning/sorting. See clangformatreformat_. + + - Spot style mistakes, typos and possible improvements in files + you maintain, patches you review, diffs, etc. See clangformatreview_. + + - Help you follow the coding style rules, specially useful for those + new to kernel development or working at the same time in several + projects with different coding styles. + +Its configuration file is ``.clang-format`` in the root of the kernel tree. +The rules contained there try to approximate the most common kernel +coding style. They also try to follow :ref:`Documentation/process/coding-style.rst <codingstyle>` +as much as possible. Since not all the kernel follows the same style, +it is possible that you may want to tweak the defaults for a particular +subsystem or folder. To do so, you can override the defaults by writing +another ``.clang-format`` file in a subfolder. + +The tool itself has already been included in the repositories of popular +Linux distributions for a long time. Search for ``clang-format`` in +your repositories. Otherwise, you can either download pre-built +LLVM/clang binaries or build the source code from: + + http://releases.llvm.org/download.html + +See more information about the tool at: + + https://clang.llvm.org/docs/ClangFormat.html + + https://clang.llvm.org/docs/ClangFormatStyleOptions.html + + +.. _clangformatreview: + +Review files and patches for coding style +----------------------------------------- + +By running the tool in its inline mode, you can review full subsystems, +folders or individual files for code style mistakes, typos or improvements. + +To do so, you can run something like:: + + # Make sure your working directory is clean! + clang-format -i kernel/*.[ch] + +And then take a look at the git diff. + +Counting the lines of such a diff is also useful for improving/tweaking +the style options in the configuration file; as well as testing new +``clang-format`` features/versions. 
+ +``clang-format`` also supports reading unified diffs, so you can review +patches and git diffs easily. See the documentation at: + + https://clang.llvm.org/docs/ClangFormat.html#script-for-patch-reformatting + +To avoid ``clang-format`` formatting some portion of a file, you can do:: + + int formatted_code; + // clang-format off + void unformatted_code ; + // clang-format on + void formatted_code_again; + +While it might be tempting to use this to keep a file always in sync with +``clang-format``, specially if you are writing new files or if you are +a maintainer, please note that people might be running different +``clang-format`` versions or not have it available at all. Therefore, +you should probably refrain yourself from using this in kernel sources; +at least until we see if ``clang-format`` becomes commonplace. + + +.. _clangformatreformat: + +Reformatting blocks of code +--------------------------- + +By using an integration with your text editor, you can reformat arbitrary +blocks (selections) of code with a single keystroke. This is specially +useful when moving code around, for complex code that is deeply intended, +for multi-line macros (and aligning their backslashes), etc. + +Remember that you can always tweak the changes afterwards in those cases +where the tool did not do an optimal job. But as a first approximation, +it can be very useful. + +There are integrations for many popular text editors. For some of them, +like vim, emacs, BBEdit and Visual Studio you can find support built-in. +For instructions, read the appropiate section at: + + https://clang.llvm.org/docs/ClangFormat.html + +For Atom, Eclipse, Sublime Text, Visual Studio Code, XCode and other +editors and IDEs you should be able to find ready-to-use plugins. + +For this use case, consider using a secondary ``.clang-format`` +so that you can tweak a few options. See clangformatextra_. + + +.. _clangformatmissing: + +Missing support +--------------- + +``clang-format`` is missing support for some things that are common +in kernel code. They are easy to remember, so if you use the tool +regularly, you will quickly learn to avoid/ignore those. + +In particular, some very common ones you will notice are: + + - Aligned blocks of one-line ``#defines``, e.g.:: + + #define TRACING_MAP_BITS_DEFAULT 11 + #define TRACING_MAP_BITS_MAX 17 + #define TRACING_MAP_BITS_MIN 7 + + vs.:: + + #define TRACING_MAP_BITS_DEFAULT 11 + #define TRACING_MAP_BITS_MAX 17 + #define TRACING_MAP_BITS_MIN 7 + + - Aligned designated initializers, e.g.:: + + static const struct file_operations uprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, + }; + + vs.:: + + static const struct file_operations uprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, + }; + + +.. _clangformatextra: + +Extra features/options +---------------------- + +Some features/style options are not enabled by default in the configuration +file in order to minimize the differences between the output and the current +code. In other words, to make the difference as small as possible, +which makes reviewing full-file style, as well diffs and patches as easy +as possible. + +In other cases (e.g. particular subsystems/folders/files), the kernel style +might be different and enabling some of these options may approximate +better the style there. 
+ +For instance: + + - Aligning assignments (``AlignConsecutiveAssignments``). + + - Aligning declarations (``AlignConsecutiveDeclarations``). + + - Reflowing text in comments (``ReflowComments``). + + - Sorting ``#includes`` (``SortIncludes``). + +They are typically useful for block re-formatting, rather than full-file. +You might want to create another ``.clang-format`` file and use that one +from your editor/IDE instead. diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst index d98deb62c400..4e7c0a1c427a 100644 --- a/Documentation/process/coding-style.rst +++ b/Documentation/process/coding-style.rst @@ -631,6 +631,14 @@ options ``-kr -i8`` (stands for ``K&R, 8 character indents``), or use re-formatting you may want to take a look at the man page. But remember: ``indent`` is not a fix for bad programming. +Note that you can also use the ``clang-format`` tool to help you with +these rules, to quickly re-format parts of your code automatically, +and to review full files in order to spot coding style mistakes, +typos and possible improvements. It is also handy for sorting ``#includes``, +for aligning variables/macros, for reflowing text and other similar tasks. +See the file :ref:`Documentation/process/clang-format.rst <clangformat>` +for more details. + 10) Kconfig configuration files ------------------------------- diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 412314eebda6..eded671d55eb 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -964,32 +964,34 @@ detect a hard lockup condition. tainted: -Non-zero if the kernel has been tainted. Numeric values, which -can be ORed together: - - 1 - A module with a non-GPL license has been loaded, this - includes modules with no license. - Set by modutils >= 2.4.9 and module-init-tools. - 2 - A module was force loaded by insmod -f. - Set by modutils >= 2.4.9 and module-init-tools. - 4 - Unsafe SMP processors: SMP with CPUs not designed for SMP. - 8 - A module was forcibly unloaded from the system by rmmod -f. - 16 - A hardware machine check error occurred on the system. - 32 - A bad page was discovered on the system. - 64 - The user has asked that the system be marked "tainted". This - could be because they are running software that directly modifies - the hardware, or for other reasons. - 128 - The system has died. - 256 - The ACPI DSDT has been overridden with one supplied by the user - instead of using the one provided by the hardware. - 512 - A kernel warning has occurred. -1024 - A module from drivers/staging was loaded. -2048 - The system is working around a severe firmware bug. -4096 - An out-of-tree module has been loaded. -8192 - An unsigned module has been loaded in a kernel supporting module - signature. -16384 - A soft lockup has previously occurred on the system. -32768 - The kernel has been live patched. +Non-zero if the kernel has been tainted. Numeric values, which can be +ORed together. The letters are seen in "Tainted" line of Oops reports. + + 1 (P): A module with a non-GPL license has been loaded, this + includes modules with no license. + Set by modutils >= 2.4.9 and module-init-tools. + 2 (F): A module was force loaded by insmod -f. + Set by modutils >= 2.4.9 and module-init-tools. + 4 (S): Unsafe SMP processors: SMP with CPUs not designed for SMP. + 8 (R): A module was forcibly unloaded from the system by rmmod -f. + 16 (M): A hardware machine check error occurred on the system. 
+ 32 (B): A bad page was discovered on the system. + 64 (U): The user has asked that the system be marked "tainted". This + could be because they are running software that directly modifies + the hardware, or for other reasons. + 128 (D): The system has died. + 256 (A): The ACPI DSDT has been overridden with one supplied by the user + instead of using the one provided by the hardware. + 512 (W): A kernel warning has occurred. + 1024 (C): A module from drivers/staging was loaded. + 2048 (I): The system is working around a severe firmware bug. + 4096 (O): An out-of-tree module has been loaded. + 8192 (E): An unsigned module has been loaded in a kernel supporting module + signature. + 16384 (L): A soft lockup has previously occurred on the system. + 32768 (K): The kernel has been live patched. + 65536 (X): Auxiliary taint, defined and used by for distros. +131072 (T): The kernel was built with the struct randomization plugin. ============================================================== diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index ff234d229cbb..17256f2ad919 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -312,8 +312,6 @@ The lowmem_reserve_ratio is an array. You can see them by reading this file. % cat /proc/sys/vm/lowmem_reserve_ratio 256 256 32 - -Note: # of this elements is one fewer than number of zones. Because the highest - zone's value is not necessary for following calculation. But, these values are not used directly. The kernel calculates # of protection pages for each zones from them. These are shown as array of protection pages @@ -364,7 +362,8 @@ As above expression, they are reciprocal number of ratio. pages of higher zones on the node. If you would like to protect more pages, smaller values are effective. -The minimum value is 1 (1/1 -> 100%). +The minimum value is 1 (1/1 -> 100%). The value less than 1 completely +disables protection of the pages. ============================================================== diff --git a/Documentation/vm/hmm.txt b/Documentation/vm/hmm.txt index 4d3aac9f4a5d..2d1d6f69e91b 100644 --- a/Documentation/vm/hmm.txt +++ b/Documentation/vm/hmm.txt @@ -1,152 +1,160 @@ Heterogeneous Memory Management (HMM) -Transparently allow any component of a program to use any memory region of said -program with a device without using device specific memory allocator. This is -becoming a requirement to simplify the use of advance heterogeneous computing -where GPU, DSP or FPGA are use to perform various computations. - -This document is divided as follow, in the first section i expose the problems -related to the use of a device specific allocator. The second section i expose -the hardware limitations that are inherent to many platforms. The third section -gives an overview of HMM designs. The fourth section explains how CPU page- -table mirroring works and what is HMM purpose in this context. Fifth section -deals with how device memory is represented inside the kernel. Finaly the last -section present the new migration helper that allow to leverage the device DMA -engine. - - -1) Problems of using device specific memory allocator: -2) System bus, device memory characteristics -3) Share address space and migration +Provide infrastructure and helpers to integrate non-conventional memory (device +memory like GPU on board memory) into regular kernel path, with the cornerstone +of this being specialized struct page for such memory (see sections 5 to 7 of +this document). 
+ +HMM also provides optional helpers for SVM (Share Virtual Memory), i.e., +allowing a device to transparently access program address coherently with the +CPU meaning that any valid pointer on the CPU is also a valid pointer for the +device. This is becoming mandatory to simplify the use of advanced hetero- +geneous computing where GPU, DSP, or FPGA are used to perform various +computations on behalf of a process. + +This document is divided as follows: in the first section I expose the problems +related to using device specific memory allocators. In the second section, I +expose the hardware limitations that are inherent to many platforms. The third +section gives an overview of the HMM design. The fourth section explains how +CPU page-table mirroring works and the purpose of HMM in this context. The +fifth section deals with how device memory is represented inside the kernel. +Finally, the last section presents a new migration helper that allows lever- +aging the device DMA engine. + + +1) Problems of using a device specific memory allocator: +2) I/O bus, device memory characteristics +3) Shared address space and migration 4) Address space mirroring implementation and API 5) Represent and manage device memory from core kernel point of view -6) Migrate to and from device memory +6) Migration to and from device memory 7) Memory cgroup (memcg) and rss accounting ------------------------------------------------------------------------------- -1) Problems of using device specific memory allocator: - -Device with large amount of on board memory (several giga bytes) like GPU have -historically manage their memory through dedicated driver specific API. This -creates a disconnect between memory allocated and managed by device driver and -regular application memory (private anonymous, share memory or regular file -back memory). From here on i will refer to this aspect as split address space. -I use share address space to refer to the opposite situation ie one in which -any memory region can be use by device transparently. - -Split address space because device can only access memory allocated through the -device specific API. This imply that all memory object in a program are not -equal from device point of view which complicate large program that rely on a -wide set of libraries. - -Concretly this means that code that wants to leverage device like GPU need to -copy object between genericly allocated memory (malloc, mmap private/share/) -and memory allocated through the device driver API (this still end up with an -mmap but of the device file). - -For flat dataset (array, grid, image, ...) this isn't too hard to achieve but -complex data-set (list, tree, ...) are hard to get right. Duplicating a complex -data-set need to re-map all the pointer relations between each of its elements. -This is error prone and program gets harder to debug because of the duplicate -data-set. - -Split address space also means that library can not transparently use data they -are getting from core program or other library and thus each library might have -to duplicate its input data-set using specific memory allocator. Large project -suffer from this and waste resources because of the various memory copy. - -Duplicating each library API to accept as input or output memory allocted by +1) Problems of using a device specific memory allocator: + +Devices with a large amount of on board memory (several gigabytes) like GPUs +have historically managed their memory through dedicated driver specific APIs. 
+This creates a disconnect between memory allocated and managed by a device +driver and regular application memory (private anonymous, shared memory, or +regular file backed memory). From here on I will refer to this aspect as split +address space. I use shared address space to refer to the opposite situation: +i.e., one in which any application memory region can be used by a device +transparently. + +Split address space happens because device can only access memory allocated +through device specific API. This implies that all memory objects in a program +are not equal from the device point of view which complicates large programs +that rely on a wide set of libraries. + +Concretely this means that code that wants to leverage devices like GPUs needs +to copy object between generically allocated memory (malloc, mmap private, mmap +share) and memory allocated through the device driver API (this still ends up +with an mmap but of the device file). + +For flat data sets (array, grid, image, ...) this isn't too hard to achieve but +complex data sets (list, tree, ...) are hard to get right. Duplicating a +complex data set needs to re-map all the pointer relations between each of its +elements. This is error prone and program gets harder to debug because of the +duplicate data set and addresses. + +Split address space also means that libraries cannot transparently use data +they are getting from the core program or another library and thus each library +might have to duplicate its input data set using the device specific memory +allocator. Large projects suffer from this and waste resources because of the +various memory copies. + +Duplicating each library API to accept as input or output memory allocated by each device specific allocator is not a viable option. It would lead to a -combinatorial explosions in the library entry points. +combinatorial explosion in the library entry points. -Finaly with the advance of high level language constructs (in C++ but in other -language too) it is now possible for compiler to leverage GPU or other devices -without even the programmer knowledge. Some of compiler identified patterns are -only do-able with a share address. It is as well more reasonable to use a share -address space for all the other patterns. +Finally, with the advance of high level language constructs (in C++ but in +other languages too) it is now possible for the compiler to leverage GPUs and +other devices without programmer knowledge. Some compiler identified patterns +are only do-able with a shared address space. It is also more reasonable to use +a shared address space for all other patterns. ------------------------------------------------------------------------------- -2) System bus, device memory characteristics +2) I/O bus, device memory characteristics -System bus cripple share address due to few limitations. Most system bus only -allow basic memory access from device to main memory, even cache coherency is -often optional. Access to device memory from CPU is even more limited, most -often than not it is not cache coherent. +I/O buses cripple shared address spaces due to a few limitations. Most I/O +buses only allow basic memory access from device to main memory; even cache +coherency is often optional. Access to device memory from CPU is even more +limited. More often than not, it is not cache coherent. -If we only consider the PCIE bus than device can access main memory (often -through an IOMMU) and be cache coherent with the CPUs. 
However it only allows -a limited set of atomic operation from device on main memory. This is worse -in the other direction the CPUs can only access a limited range of the device -memory and can not perform atomic operations on it. Thus device memory can not -be consider like regular memory from kernel point of view. +If we only consider the PCIE bus, then a device can access main memory (often +through an IOMMU) and be cache coherent with the CPUs. However, it only allows +a limited set of atomic operations from device on main memory. This is worse +in the other direction: the CPU can only access a limited range of the device +memory and cannot perform atomic operations on it. Thus device memory cannot +be considered the same as regular memory from the kernel point of view. Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0 -and 16 lanes). This is 33 times less that fastest GPU memory (1 TBytes/s). -The final limitation is latency, access to main memory from the device has an -order of magnitude higher latency than when the device access its own memory. +and 16 lanes). This is 33 times less than the fastest GPU memory (1 TBytes/s). +The final limitation is latency. Access to main memory from the device has an +order of magnitude higher latency than when the device accesses its own memory. -Some platform are developing new system bus or additions/modifications to PCIE -to address some of those limitations (OpenCAPI, CCIX). They mainly allow two +Some platforms are developing new I/O buses or additions/modifications to PCIE +to address some of these limitations (OpenCAPI, CCIX). They mainly allow two- way cache coherency between CPU and device and allow all atomic operations the -architecture supports. Saddly not all platform are following this trends and -some major architecture are left without hardware solutions to those problems. +architecture supports. Sadly, not all platforms are following this trend and +some major architectures are left without hardware solutions to these problems. -So for share address space to make sense not only we must allow device to -access any memory memory but we must also permit any memory to be migrated to -device memory while device is using it (blocking CPU access while it happens). +So for shared address space to make sense, not only must we allow devices to +access any memory but we must also permit any memory to be migrated to device +memory while device is using it (blocking CPU access while it happens). ------------------------------------------------------------------------------- -3) Share address space and migration +3) Shared address space and migration HMM intends to provide two main features. First one is to share the address -space by duplication the CPU page table into the device page table so same -address point to same memory and this for any valid main memory address in +space by duplicating the CPU page table in the device page table so the same +address points to the same physical memory for any valid main memory address in the process address space. -To achieve this, HMM offer a set of helpers to populate the device page table +To achieve this, HMM offers a set of helpers to populate the device page table while keeping track of CPU page table updates. Device page table updates are -not as easy as CPU page table updates. 
To update the device page table you must -allow a buffer (or use a pool of pre-allocated buffer) and write GPU specifics -commands in it to perform the update (unmap, cache invalidations and flush, -...). This can not be done through common code for all device. Hence why HMM -provides helpers to factor out everything that can be while leaving the gory -details to the device driver. - -The second mechanism HMM provide is a new kind of ZONE_DEVICE memory that does -allow to allocate a struct page for each page of the device memory. Those page -are special because the CPU can not map them. They however allow to migrate -main memory to device memory using exhisting migration mechanism and everything -looks like if page was swap out to disk from CPU point of view. Using a struct -page gives the easiest and cleanest integration with existing mm mechanisms. -Again here HMM only provide helpers, first to hotplug new ZONE_DEVICE memory -for the device memory and second to perform migration. Policy decision of what -and when to migrate things is left to the device driver. - -Note that any CPU access to a device page trigger a page fault and a migration -back to main memory ie when a page backing an given address A is migrated from -a main memory page to a device page then any CPU access to address A trigger a -page fault and initiate a migration back to main memory. - - -With this two features, HMM not only allow a device to mirror a process address -space and keeps both CPU and device page table synchronize, but also allow to -leverage device memory by migrating part of data-set that is actively use by a -device. +not as easy as CPU page table updates. To update the device page table, you must +allocate a buffer (or use a pool of pre-allocated buffers) and write GPU +specific commands in it to perform the update (unmap, cache invalidations, and +flush, ...). This cannot be done through common code for all devices. Hence +why HMM provides helpers to factor out everything that can be while leaving the +hardware specific details to the device driver. + +The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that +allows allocating a struct page for each page of the device memory. Those pages +are special because the CPU cannot map them. However, they allow migrating +main memory to device memory using existing migration mechanisms and everything +looks like a page is swapped out to disk from the CPU point of view. Using a +struct page gives the easiest and cleanest integration with existing mm mech- +anisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE +memory for the device memory and second to perform migration. Policy decisions +of what and when to migrate things is left to the device driver. + +Note that any CPU access to a device page triggers a page fault and a migration +back to main memory. For example, when a page backing a given CPU address A is +migrated from a main memory page to a device page, then any CPU access to +address A triggers a page fault and initiates a migration back to main memory. + +With these two features, HMM not only allows a device to mirror process address +space and keeping both CPU and device page table synchronized, but also lever- +ages device memory by migrating the part of the data set that is actively being +used by the device. 
------------------------------------------------------------------------------- 4) Address space mirroring implementation and API -Address space mirroring main objective is to allow to duplicate range of CPU -page table into a device page table and HMM helps keeping both synchronize. A -device driver that want to mirror a process address space must start with the +Address space mirroring's main objective is to allow duplication of a range of +CPU page table into a device page table; HMM helps keep both synchronized. A +device driver that wants to mirror a process address space must start with the registration of an hmm_mirror struct: int hmm_mirror_register(struct hmm_mirror *mirror, @@ -154,9 +162,9 @@ registration of an hmm_mirror struct: int hmm_mirror_register_locked(struct hmm_mirror *mirror, struct mm_struct *mm); -The locked variant is to be use when the driver is already holding the mmap_sem -of the mm in write mode. The mirror struct has a set of callback that are use -to propagate CPU page table: +The locked variant is to be used when the driver is already holding mmap_sem +of the mm in write mode. The mirror struct has a set of callbacks that are used +to propagate CPU page tables: struct hmm_mirror_ops { /* sync_cpu_device_pagetables() - synchronize page tables @@ -181,13 +189,13 @@ to propagate CPU page table: unsigned long end); }; -Device driver must perform update to the range following action (turn range -read only, or fully unmap, ...). Once driver callback returns the device must -be done with the update. +The device driver must perform the update action to the range (mark range +read only, or fully unmap, ...). The device must be done with the update before +the driver callback returns. -When device driver wants to populate a range of virtual address it can use -either: +When the device driver wants to populate a range of virtual addresses, it can +use either: int hmm_vma_get_pfns(struct vm_area_struct *vma, struct hmm_range *range, unsigned long start, @@ -201,17 +209,19 @@ either: bool write, bool block); -First one (hmm_vma_get_pfns()) will only fetch present CPU page table entry and -will not trigger a page fault on missing or non present entry. The second one -do trigger page fault on missing or read only entry if write parameter is true. -Page fault use the generic mm page fault code path just like a CPU page fault. +The first one (hmm_vma_get_pfns()) will only fetch present CPU page table +entries and will not trigger a page fault on missing or non-present entries. +The second one does trigger a page fault on missing or read-only entry if the +write parameter is true. Page faults use the generic mm page fault code path +just like a CPU page fault. -Both function copy CPU page table into their pfns array argument. Each entry in -that array correspond to an address in the virtual range. HMM provide a set of -flags to help driver identify special CPU page table entries. +Both functions copy CPU page table entries into their pfns array argument. Each +entry in that array corresponds to an address in the virtual range. HMM +provides a set of flags to help the driver identify special CPU page table +entries. Locking with the update() callback is the most important aspect the driver must -respect in order to keep things properly synchronize. The usage pattern is : +respect in order to keep things properly synchronized. The usage pattern is: int driver_populate_range(...) { @@ -233,43 +243,44 @@ respect in order to keep things properly synchronize. 
The usage pattern is : return 0; } -The driver->update lock is the same lock that driver takes inside its update() -callback. That lock must be call before hmm_vma_range_done() to avoid any race -with a concurrent CPU page table update. +The driver->update lock is the same lock that the driver takes inside its +update() callback. That lock must be held before hmm_vma_range_done() to avoid +any race with a concurrent CPU page table update. -HMM implements all this on top of the mmu_notifier API because we wanted to a -simpler API and also to be able to perform optimization latter own like doing -concurrent device update in multi-devices scenario. +HMM implements all this on top of the mmu_notifier API because we wanted a +simpler API and also to be able to perform optimizations latter on like doing +concurrent device updates in multi-devices scenario. -HMM also serve as an impedence missmatch between how CPU page table update are -done (by CPU write to the page table and TLB flushes) from how device update -their own page table. Device update is a multi-step process, first appropriate -commands are write to a buffer, then this buffer is schedule for execution on -the device. It is only once the device has executed commands in the buffer that -the update is done. Creating and scheduling update command buffer can happen -concurrently for multiple devices. Waiting for each device to report commands -as executed is serialize (there is no point in doing this concurrently). +HMM also serves as an impedance mismatch between how CPU page table updates +are done (by CPU write to the page table and TLB flushes) and how devices +update their own page table. Device updates are a multi-step process. First, +appropriate commands are written to a buffer, then this buffer is scheduled for +execution on the device. It is only once the device has executed commands in +the buffer that the update is done. Creating and scheduling the update command +buffer can happen concurrently for multiple devices. Waiting for each device to +report commands as executed is serialized (there is no point in doing this +concurrently). ------------------------------------------------------------------------------- 5) Represent and manage device memory from core kernel point of view -Several differents design were try to support device memory. First one use -device specific data structure to keep information about migrated memory and -HMM hooked itself in various place of mm code to handle any access to address -that were back by device memory. It turns out that this ended up replicating -most of the fields of struct page and also needed many kernel code path to be -updated to understand this new kind of memory. +Several different designs were tried to support device memory. First one used +a device specific data structure to keep information about migrated memory and +HMM hooked itself in various places of mm code to handle any access to +addresses that were backed by device memory. It turns out that this ended up +replicating most of the fields of struct page and also needed many kernel code +paths to be updated to understand this new kind of memory. -Thing is most kernel code path never try to access the memory behind a page -but only care about struct page contents. Because of this HMM switchted to -directly using struct page for device memory which left most kernel code path -un-aware of the difference. We only need to make sure that no one ever try to -map those page from the CPU side. 
+Most kernel code paths never try to access the memory behind a page +but only care about struct page contents. Because of this, HMM switched to +directly using struct page for device memory which left most kernel code paths +unaware of the difference. We only need to make sure that no one ever tries to +map those pages from the CPU side. -HMM provide a set of helpers to register and hotplug device memory as a new -region needing struct page. This is offer through a very simple API: +HMM provides a set of helpers to register and hotplug device memory as a new +region needing a struct page. This is offered through a very simple API: struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, struct device *device, @@ -289,18 +300,19 @@ The hmm_devmem_ops is where most of the important things are: }; The first callback (free()) happens when the last reference on a device page is -drop. This means the device page is now free and no longer use by anyone. The -second callback happens whenever CPU try to access a device page which it can -not do. This second callback must trigger a migration back to system memory. +dropped. This means the device page is now free and no longer used by anyone. +The second callback happens whenever the CPU tries to access a device page +which it cannot do. This second callback must trigger a migration back to +system memory. ------------------------------------------------------------------------------- -6) Migrate to and from device memory +6) Migration to and from device memory -Because CPU can not access device memory, migration must use device DMA engine -to perform copy from and to device memory. For this we need a new migration -helper: +Because the CPU cannot access device memory, migration must use the device DMA +engine to perform copy from and to device memory. For this we need a new +migration helper: int migrate_vma(const struct migrate_vma_ops *ops, struct vm_area_struct *vma, @@ -311,15 +323,15 @@ helper: unsigned long *dst, void *private); -Unlike other migration function it works on a range of virtual address, there -is two reasons for that. First device DMA copy has a high setup overhead cost +Unlike other migration functions it works on a range of virtual address, there +are two reasons for that. First, device DMA copy has a high setup overhead cost and thus batching multiple pages is needed as otherwise the migration overhead -make the whole excersie pointless. The second reason is because driver trigger -such migration base on range of address the device is actively accessing. +makes the whole exercise pointless. The second reason is because the +migration might be for a range of addresses the device is actively accessing. -The migrate_vma_ops struct define two callbacks. First one (alloc_and_copy()) -control destination memory allocation and copy operation. Second one is there -to allow device driver to perform cleanup operation after migration. +The migrate_vma_ops struct defines two callbacks. First one (alloc_and_copy()) +controls destination memory allocation and copy operation. Second one is there +to allow the device driver to perform cleanup operations after migration. struct migrate_vma_ops { void (*alloc_and_copy)(struct vm_area_struct *vma, @@ -336,19 +348,19 @@ to allow device driver to perform cleanup operation after migration. void *private); }; -It is important to stress that this migration helpers allow for hole in the +It is important to stress that these migration helpers allow for holes in the virtual address range. 
Some pages in the range might not be migrated for all -the usual reasons (page is pin, page is lock, ...). This helper does not fail -but just skip over those pages. +the usual reasons (page is pinned, page is locked, ...). This helper does not +fail but just skips over those pages. -The alloc_and_copy() might as well decide to not migrate all pages in the -range (for reasons under the callback control). For those the callback just -have to leave the corresponding dst entry empty. +The alloc_and_copy() might decide to not migrate all pages in the +range (for reasons under the callback control). For those, the callback just +has to leave the corresponding dst entry empty. -Finaly the migration of the struct page might fails (for file back page) for +Finally, the migration of the struct page might fail (for file backed page) for various reasons (failure to freeze reference, or update page cache, ...). If -that happens then the finalize_and_map() can catch any pages that was not -migrated. Note those page were still copied to new page and thus we wasted +that happens, then the finalize_and_map() can catch any pages that were not +migrated. Note those pages were still copied to a new page and thus we wasted bandwidth but this is considered as a rare event and a price that we are willing to pay to keep all the code simpler. @@ -358,27 +370,27 @@ willing to pay to keep all the code simpler. 7) Memory cgroup (memcg) and rss accounting For now device memory is accounted as any regular page in rss counters (either -anonymous if device page is use for anonymous, file if device page is use for -file back page or shmem if device page is use for share memory). This is a -deliberate choice to keep existing application that might start using device -memory without knowing about it to keep runing unimpacted. - -Drawbacks is that OOM killer might kill an application using a lot of device -memory and not a lot of regular system memory and thus not freeing much system -memory. We want to gather more real world experience on how application and -system react under memory pressure in the presence of device memory before +anonymous if device page is used for anonymous, file if device page is used for +file backed page or shmem if device page is used for shared memory). This is a +deliberate choice to keep existing applications, that might start using device +memory without knowing about it, running unimpacted. + +A drawback is that the OOM killer might kill an application using a lot of +device memory and not a lot of regular system memory and thus not freeing much +system memory. We want to gather more real world experience on how applications +and system react under memory pressure in the presence of device memory before deciding to account device memory differently. -Same decision was made for memory cgroup. Device memory page are accounted +Same decision was made for memory cgroup. Device memory pages are accounted against same memory cgroup a regular page would be accounted to. This does simplify migration to and from device memory. This also means that migration -back from device memory to regular memory can not fail because it would +back from device memory to regular memory cannot fail because it would go above memory cgroup limit. We might revisit this choice latter on once we -get more experience in how device memory is use and its impact on memory +get more experience in how device memory is used and its impact on memory resource control. 
-Note that device memory can never be pin nor by device driver nor through GUP +Note that device memory can never be pinned by device driver nor through GUP and thus such memory is always free upon process exit. Or when last reference -is drop in case of share memory or file back memory. +is dropped in case of shared memory or file backed memory. diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration index 0478ae2ad44a..496868072e24 100644 --- a/Documentation/vm/page_migration +++ b/Documentation/vm/page_migration @@ -90,7 +90,7 @@ Steps: 1. Lock the page to be migrated -2. Insure that writeback is complete. +2. Ensure that writeback is complete. 3. Lock the new page that we want to move to. It is locked so that accesses to this (not yet uptodate) page immediately lock while the move is in progress. @@ -100,8 +100,8 @@ Steps: mapcount is not zero then we do not migrate the page. All user space processes that attempt to access the page will now wait on the page lock. -5. The radix tree lock is taken. This will cause all processes trying - to access the page via the mapping to block on the radix tree spinlock. +5. The i_pages lock is taken. This will cause all processes trying + to access the page via the mapping to block on the spinlock. 6. The refcount of the page is examined and we back out if references remain otherwise we know that we are the only one referencing this page. @@ -114,12 +114,12 @@ Steps: 9. The radix tree is changed to point to the new page. -10. The reference count of the old page is dropped because the radix tree +10. The reference count of the old page is dropped because the address space reference is gone. A reference to the new page is established because - the new page is referenced to by the radix tree. + the new page is referenced by the address space. -11. The radix tree lock is dropped. With that lookups in the mapping - become possible again. Processes will move from spinning on the tree_lock +11. The i_pages lock is dropped. With that lookups in the mapping + become possible again. Processes will move from spinning on the lock to sleeping on the locked new page. 12. The page contents are copied to the new page. 
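
The same lock rename shows up throughout the conversion below. As a rough,
simplified sketch of steps 5 through 11 with the new naming (the real
migrate_page_move_mapping() additionally freezes the page reference count and
handles huge pages, shadow entries and dirty tag propagation, none of which is
shown here):

    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/pagemap.h>
    #include <linux/radix-tree.h>

    /* Illustrative only, not the kernel's actual implementation. */
    static void sketch_replace_page(struct address_space *mapping,
                                    struct page *oldpage, struct page *newpage,
                                    pgoff_t index)
    {
            void **slot;

            xa_lock_irq(&mapping->i_pages);     /* step 5: take the i_pages lock */

            get_page(newpage);                  /* step 10: the address space will reference newpage */
            slot = radix_tree_lookup_slot(&mapping->i_pages, index);
            radix_tree_replace_slot(&mapping->i_pages, slot, newpage); /* step 9 */
            put_page(oldpage);                  /* step 10: the old address space reference is gone */

            xa_unlock_irq(&mapping->i_pages);   /* step 11: lookups in the mapping resume */
    }

Processes that used to spin on the old tree_lock now spin on the xa_lock
embedded in i_pages, but the serialization they rely on is unchanged.
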
diff --git a/MAINTAINERS b/MAINTAINERS index 7bb2e9595f14..c7182d2a9f5c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4392,7 +4392,7 @@ S: Maintained F: drivers/staging/fsl-dpaa2/ethsw DPT_I2O SCSI RAID DRIVER -M: Adaptec OEM Raid Solutions <aacraid@adaptec.com> +M: Adaptec OEM Raid Solutions <aacraid@microsemi.com> L: linux-scsi@vger.kernel.org W: http://www.adaptec.com/ S: Maintained @@ -6410,6 +6410,7 @@ L: linux-mm@kvack.org S: Maintained F: mm/hmm* F: include/linux/hmm* +F: Documentation/vm/hmm.txt HOST AP DRIVER M: Jouni Malinen <j@w1.fi> @@ -7344,7 +7345,7 @@ F: include/linux/ipmi* F: include/uapi/linux/ipmi* IPS SCSI RAID DRIVER -M: Adaptec OEM Raid Solutions <aacraid@adaptec.com> +M: Adaptec OEM Raid Solutions <aacraid@microsemi.com> L: linux-scsi@vger.kernel.org W: http://www.adaptec.com/ S: Maintained @@ -11762,7 +11763,7 @@ F: drivers/char/random.c RAPIDIO SUBSYSTEM M: Matt Porter <mporter@kernel.crashing.org> -M: Alexandre Bounine <alexandre.bounine@idt.com> +M: Alexandre Bounine <alex.bou9@gmail.com> S: Maintained F: drivers/rapidio/ diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 2dbdf59258d9..f9d4e6b6d4bd 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -32,6 +32,7 @@ #define MAP_NONBLOCK 0x40000 /* do not block on IO */ #define MAP_STACK 0x80000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x100000 /* create a huge page mapping */ +#define MAP_FIXED_NOREPLACE 0x200000/* MAP_FIXED which doesn't unmap underlying mapping */ #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_SYNC 2 /* synchronous memory sync */ diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 74504b154256..869080bedb89 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -318,10 +318,8 @@ static inline void flush_anon_page(struct vm_area_struct *vma, #define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE extern void flush_kernel_dcache_page(struct page *); -#define flush_dcache_mmap_lock(mapping) \ - spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) \ - spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) +#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) #define flush_icache_user_range(vma,page,addr,len) \ flush_dcache_page(page) diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index 496667703693..ed8fd0d19a3e 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -22,12 +22,6 @@ #include <mach/memory.h> #endif -/* - * Allow for constants defined here to be used from assembly code - * by prepending the UL suffix only with actual C code compilation. 
- */ -#define UL(x) _AC(x, UL) - /* PAGE_OFFSET - the virtual address of the start of the kernel image */ #define PAGE_OFFSET UL(CONFIG_PAGE_OFFSET) diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index ada8eb206a90..8c398fedbbb6 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -466,6 +466,12 @@ void __init dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) void __init dma_contiguous_remap(void) { int i; + + if (!dma_mmu_remap_num) + return; + + /* call flush_cache_all() since CMA area would be large enough */ + flush_cache_all(); for (i = 0; i < dma_mmu_remap_num; i++) { phys_addr_t start = dma_mmu_remap[i].base; phys_addr_t end = start + dma_mmu_remap[i].size; @@ -498,7 +504,15 @@ void __init dma_contiguous_remap(void) flush_tlb_kernel_range(__phys_to_virt(start), __phys_to_virt(end)); - iotable_init(&map, 1); + /* + * All the memory in CMA region will be on ZONE_MOVABLE. + * If that zone is considered as highmem, the memory in CMA + * region is also considered as highmem even if it's + * physical address belong to lowmem. In this case, + * re-mapping isn't required. + */ + if (!is_highmem_idx(ZONE_MOVABLE)) + iotable_init(&map, 1); } } diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index eb1de66517d5..f866870db749 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -21,20 +21,20 @@ #define MIN_GAP (128*1024*1024UL) #define MAX_GAP ((TASK_SIZE)/6*5) -static int mmap_is_legacy(void) +static int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; } -static unsigned long mmap_base(unsigned long rnd) +static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; @@ -180,18 +180,18 @@ unsigned long arch_mmap_rnd(void) return rnd << PAGE_SHIFT; } -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 7dfcec4700fe..0094c6653b06 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -140,10 +140,8 @@ static inline void __flush_icache_all(void) dsb(ish); } -#define flush_dcache_mmap_lock(mapping) \ - spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) \ - spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) do { } while (0) +#define flush_dcache_mmap_unlock(mapping) do { } while (0) /* * We don't appear to need to do anything here. 
In fact, if we did, we'd diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 50fa96a49792..49d99214f43c 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -29,12 +29,6 @@ #include <asm/sizes.h> /* - * Allow for constants defined here to be used from assembly code - * by prepending the UL suffix only with actual C code compilation. - */ -#define UL(x) _AC(x, UL) - -/* * Size of the PCI I/O space. This must remain a power of two so that * IO_SPACE_LIMIT acts as a mask for the low bits of I/O addresses. */ diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index decccffb03ca..842c8a5fcd53 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -38,12 +38,12 @@ #define MIN_GAP (SZ_128M) #define MAX_GAP (STACK_TOP/6*5) -static int mmap_is_legacy(void) +static int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; @@ -62,9 +62,9 @@ unsigned long arch_mmap_rnd(void) return rnd << PAGE_SHIFT; } -static unsigned long mmap_base(unsigned long rnd) +static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = (STACK_RND_MASK << PAGE_SHIFT) + stack_guard_gap; /* Values close to RLIM_INFINITY can overflow. */ @@ -83,7 +83,7 @@ static unsigned long mmap_base(unsigned long rnd) * This function, called very early during the creation of a new process VM * image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -94,11 +94,11 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * Fall back to the standard layout if the personality bit is set, or * if the expected stack growth is unlimited: */ - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 606e02ca4b6c..3035ca499cd8 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -50,6 +50,7 @@ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ #define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x80000 /* create a huge page mapping */ +#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ /* * Flags for msync diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index 33d3251ecd37..2f616ebeb7e0 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -24,20 +24,20 @@ EXPORT_SYMBOL(shm_align_mask); #define MIN_GAP (128*1024*1024UL) #define MAX_GAP ((TASK_SIZE)/6*5) -static int mmap_is_legacy(void) +static int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; } -static unsigned long mmap_base(unsigned long rnd) +static unsigned long 
mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; @@ -158,18 +158,18 @@ unsigned long arch_mmap_rnd(void) return rnd << PAGE_SHIFT; } -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/nds32/include/asm/cacheflush.h b/arch/nds32/include/asm/cacheflush.h index 7b9b20a381cb..1240f148ec0f 100644 --- a/arch/nds32/include/asm/cacheflush.h +++ b/arch/nds32/include/asm/cacheflush.h @@ -34,8 +34,8 @@ void flush_anon_page(struct vm_area_struct *vma, void flush_kernel_dcache_page(struct page *page); void flush_icache_range(unsigned long start, unsigned long end); void flush_icache_page(struct vm_area_struct *vma, struct page *page); -#define flush_dcache_mmap_lock(mapping) spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&(mapping)->i_pages) +#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&(mapping)->i_pages) #else #include <asm-generic/cacheflush.h> diff --git a/arch/nios2/include/asm/cacheflush.h b/arch/nios2/include/asm/cacheflush.h index 55e383c173f7..18eb9f69f806 100644 --- a/arch/nios2/include/asm/cacheflush.h +++ b/arch/nios2/include/asm/cacheflush.h @@ -46,9 +46,7 @@ extern void copy_from_user_page(struct vm_area_struct *vma, struct page *page, extern void flush_dcache_range(unsigned long start, unsigned long end); extern void invalidate_dcache_range(unsigned long start, unsigned long end); -#define flush_dcache_mmap_lock(mapping) \ - spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) \ - spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) +#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) #endif /* _ASM_NIOS2_CACHEFLUSH_H */ diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index bd5ce31936f5..0c83644bfa5c 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -55,10 +55,8 @@ void invalidate_kernel_vmap_range(void *vaddr, int size); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *page); -#define flush_dcache_mmap_lock(mapping) \ - spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) \ - spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) +#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) #define flush_icache_page(vma,page) do { \ flush_kernel_dcache_page(page); \ diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index a056a642bb31..870fbf8c7088 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -26,6 +26,7 @@ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ #define MAP_STACK 0x40000 /* give out 
an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x80000 /* create a huge page mapping */ +#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #define MS_SYNC 1 /* synchronous memory sync */ #define MS_ASYNC 2 /* sync memory asynchronously */ diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index 8c99ebbe2bac..43b308cfdf53 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -70,12 +70,18 @@ static inline unsigned long COLOR_ALIGN(unsigned long addr, * Top of mmap area (just below the process stack). */ -static unsigned long mmap_upper_limit(void) +/* + * When called from arch_get_unmapped_area(), rlim_stack will be NULL, + * indicating that "current" should be used instead of a passed-in + * value from the exec bprm as done with arch_pick_mmap_layout(). + */ +static unsigned long mmap_upper_limit(struct rlimit *rlim_stack) { unsigned long stack_base; /* Limit stack size - see setup_arg_pages() in fs/exec.c */ - stack_base = rlimit_max(RLIMIT_STACK); + stack_base = rlim_stack ? rlim_stack->rlim_max + : rlimit_max(RLIMIT_STACK); if (stack_base > STACK_SIZE_MAX) stack_base = STACK_SIZE_MAX; @@ -127,7 +133,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = mm->mmap_legacy_base; - info.high_limit = mmap_upper_limit(); + info.high_limit = mmap_upper_limit(NULL); info.align_mask = last_mmap ? (PAGE_MASK & (SHM_COLOUR - 1)) : 0; info.align_offset = shared_align_offset(last_mmap, pgoff); addr = vm_unmapped_area(&info); @@ -250,10 +256,10 @@ static unsigned long mmap_legacy_base(void) * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_legacy_base = mmap_legacy_base(); - mm->mmap_base = mmap_upper_limit(); + mm->mmap_base = mmap_upper_limit(rlim_stack); if (mmap_is_legacy()) { mm->mmap_base = mm->mmap_legacy_base; diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index d503f344e476..b24ce40acd47 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -39,12 +39,12 @@ #define MIN_GAP (128*1024*1024) #define MAX_GAP (TASK_SIZE/6*5) -static inline int mmap_is_legacy(void) +static inline int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; @@ -76,9 +76,10 @@ static inline unsigned long stack_maxrandom_size(void) return (1<<30); } -static inline unsigned long mmap_base(unsigned long rnd) +static inline unsigned long mmap_base(unsigned long rnd, + struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size() + stack_guard_gap; /* Values close to RLIM_INFINITY can overflow. 
*/ @@ -196,26 +197,28 @@ radix__arch_get_unmapped_area_topdown(struct file *filp, } static void radix__arch_pick_mmap_layout(struct mm_struct *mm, - unsigned long random_factor) + unsigned long random_factor, + struct rlimit *rlim_stack) { - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = radix__arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown; } } #else /* dummy */ extern void radix__arch_pick_mmap_layout(struct mm_struct *mm, - unsigned long random_factor); + unsigned long random_factor, + struct rlimit *rlim_stack); #endif /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -223,16 +226,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm) random_factor = arch_mmap_rnd(); if (radix_enabled()) - return radix__arch_pick_mmap_layout(mm, random_factor); + return radix__arch_pick_mmap_layout(mm, random_factor, + rlim_stack); /* * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: */ - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index 9a8a084e4aba..4c615fcb0cf0 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -75,8 +75,7 @@ EXPORT_SYMBOL_GPL(mm_iommu_preregistered); /* * Taken from alloc_migrate_target with changes to remove CMA allocations */ -struct page *new_iommu_non_cma_page(struct page *page, unsigned long private, - int **resultp) +struct page *new_iommu_non_cma_page(struct page *page, unsigned long private) { gfp_t gfp_mask = GFP_USER; struct page *new_page; diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 831bdcf407bb..0a7627cdb34e 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -37,11 +37,11 @@ static unsigned long stack_maxrandom_size(void) #define MIN_GAP (32*1024*1024) #define MAX_GAP (STACK_TOP/6*5) -static inline int mmap_is_legacy(void) +static inline int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; } @@ -56,9 +56,10 @@ static unsigned long mmap_base_legacy(unsigned long rnd) return TASK_UNMAPPED_BASE + rnd; } -static inline unsigned long mmap_base(unsigned long rnd) +static inline unsigned long mmap_base(unsigned long rnd, + struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; @@ -184,7 +185,7 @@ check_asce_limit: * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit 
*rlim_stack) { unsigned long random_factor = 0UL; @@ -195,11 +196,11 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: */ - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = mmap_base_legacy(random_factor); mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 348a17ecdf66..9ef8de63f28b 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -276,7 +276,7 @@ static unsigned long mmap_rnd(void) return rnd << PAGE_SHIFT; } -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = mmap_rnd(); unsigned long gap; @@ -285,7 +285,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: */ - gap = rlimit(RLIMIT_STACK); + gap = rlim_stack->rlim_cur; if (!test_thread_flag(TIF_32BIT) || (current->personality & ADDR_COMPAT_LAYOUT) || gap == RLIM_INFINITY || diff --git a/arch/unicore32/include/asm/cacheflush.h b/arch/unicore32/include/asm/cacheflush.h index a5e08e2d5d6d..1d9132b66039 100644 --- a/arch/unicore32/include/asm/cacheflush.h +++ b/arch/unicore32/include/asm/cacheflush.h @@ -170,10 +170,8 @@ extern void flush_cache_page(struct vm_area_struct *vma, #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); -#define flush_dcache_mmap_lock(mapping) \ - spin_lock_irq(&(mapping)->tree_lock) -#define flush_dcache_mmap_unlock(mapping) \ - spin_unlock_irq(&(mapping)->tree_lock) +#define flush_dcache_mmap_lock(mapping) do { } while (0) +#define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_icache_user_range(vma, page, addr, len) \ flush_dcache_page(page) diff --git a/arch/unicore32/include/asm/memory.h b/arch/unicore32/include/asm/memory.h index 3bb0a29fd2d7..66bb9f6525c0 100644 --- a/arch/unicore32/include/asm/memory.h +++ b/arch/unicore32/include/asm/memory.h @@ -20,12 +20,6 @@ #include <mach/memory.h> /* - * Allow for constants defined here to be used from assembly code - * by prepending the UL suffix only with actual C code compilation. - */ -#define UL(x) _AC(x, UL) - -/* * PAGE_OFFSET - the virtual address of the start of the kernel image * TASK_SIZE - the maximum size of a user space task. 
* TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 199e15bd3ec5..ce8b4da07e35 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -122,12 +122,14 @@ struct x86_init_pci { * @guest_late_init: guest late init * @x2apic_available: X2APIC detection * @init_mem_mapping: setup early mappings during init_mem_mapping() + * @init_after_bootmem: guest init after boot allocator is finished */ struct x86_hyper_init { void (*init_platform)(void); void (*guest_late_init)(void); bool (*x2apic_available)(void); void (*init_mem_mapping)(void); + void (*init_after_bootmem)(void); }; /** diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ebda84a91510..3ab867603e81 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -92,6 +92,7 @@ struct x86_init_ops x86_init __initdata = { .guest_late_init = x86_init_noop, .x2apic_available = bool_x86_init_noop, .init_mem_mapping = x86_init_noop, + .init_after_bootmem = x86_init_noop, }, .acpi = { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 396e1f0151ac..8008db2bddb3 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -778,6 +778,7 @@ void __init mem_init(void) free_all_bootmem(); after_bootmem = 1; + x86_init.hyper.init_after_bootmem(); mem_init_print_info(NULL); printk(KERN_INFO "virtual kernel memory layout:\n" diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index dca9abf2b85c..66de40e45f58 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1185,6 +1185,7 @@ void __init mem_init(void) /* this will put all memory onto the freelists */ free_all_bootmem(); after_bootmem = 1; + x86_init.hyper.init_after_bootmem(); /* * Must be done after boot memory is put on freelist, because here we diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 155ecbac9e28..48c591251600 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -90,9 +90,10 @@ unsigned long arch_mmap_rnd(void) return arch_rnd(mmap_is_ia32() ? 
mmap32_rnd_bits : mmap64_rnd_bits); } -static unsigned long mmap_base(unsigned long rnd, unsigned long task_size) +static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, + struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; unsigned long gap_min, gap_max; @@ -126,16 +127,17 @@ static unsigned long mmap_legacy_base(unsigned long rnd, * process VM image, sets up which VM layout function to use: */ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, - unsigned long random_factor, unsigned long task_size) + unsigned long random_factor, unsigned long task_size, + struct rlimit *rlim_stack) { *legacy_base = mmap_legacy_base(random_factor, task_size); if (mmap_is_legacy()) *base = *legacy_base; else - *base = mmap_base(random_factor, task_size); + *base = mmap_base(random_factor, task_size, rlim_stack); } -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { if (mmap_is_legacy()) mm->get_unmapped_area = arch_get_unmapped_area; @@ -143,7 +145,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->get_unmapped_area = arch_get_unmapped_area_topdown; arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, - arch_rnd(mmap64_rnd_bits), task_size_64bit(0)); + arch_rnd(mmap64_rnd_bits), task_size_64bit(0), + rlim_stack); #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES /* @@ -153,7 +156,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * mmap_base, the compat syscall uses mmap_compat_base. */ arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, - arch_rnd(mmap32_rnd_bits), task_size_32bit()); + arch_rnd(mmap32_rnd_bits), task_size_32bit(), + rlim_stack); #endif } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index d20763472920..486c0a34d00b 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -116,6 +116,8 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ static phys_addr_t xen_pt_base, xen_pt_size __initdata; +static DEFINE_STATIC_KEY_FALSE(xen_struct_pages_ready); + /* * Just beyond the highest usermode address. STACK_TOP_MAX has a * redzone above it, so round it up to a PGD boundary. @@ -155,11 +157,18 @@ void make_lowmem_page_readwrite(void *vaddr) } +/* + * During early boot all page table pages are pinned, but we do not have struct + * pages, so return true until struct pages are ready. + */ static bool xen_page_pinned(void *ptr) { - struct page *page = virt_to_page(ptr); + if (static_branch_likely(&xen_struct_pages_ready)) { + struct page *page = virt_to_page(ptr); - return PagePinned(page); + return PagePinned(page); + } + return true; } static void xen_extend_mmu_update(const struct mmu_update *update) @@ -836,11 +845,6 @@ void xen_mm_pin_all(void) spin_unlock(&pgd_lock); } -/* - * The init_mm pagetable is really pinned as soon as its created, but - * that's before we have page structures to store the bits. So do all - * the book-keeping now. - */ static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, enum pt_level level) { @@ -848,8 +852,18 @@ static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, return 0; } -static void __init xen_mark_init_mm_pinned(void) +/* + * The init_mm pagetable is really pinned as soon as its created, but + * that's before we have page structures to store the bits. 
So do all + * the book-keeping now once struct pages for allocated pages are + * initialized. This happens only after free_all_bootmem() is called. + */ +static void __init xen_after_bootmem(void) { + static_branch_enable(&xen_struct_pages_ready); +#ifdef CONFIG_X86_64 + SetPagePinned(virt_to_page(level3_user_vsyscall)); +#endif xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); } @@ -1623,14 +1637,15 @@ static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) { - bool pinned = PagePinned(virt_to_page(mm->pgd)); + bool pinned = xen_page_pinned(mm->pgd); trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); if (pinned) { struct page *page = pfn_to_page(pfn); - SetPagePinned(page); + if (static_branch_likely(&xen_struct_pages_ready)) + SetPagePinned(page); if (!PageHighMem(page)) { xen_mc_batch(); @@ -2364,9 +2379,7 @@ static void __init xen_post_allocator_init(void) #ifdef CONFIG_X86_64 pv_mmu_ops.write_cr3 = &xen_write_cr3; - SetPagePinned(virt_to_page(level3_user_vsyscall)); #endif - xen_mark_init_mm_pinned(); } static void xen_leave_lazy_mmu(void) @@ -2450,6 +2463,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { x86_init.paging.pagetable_init = xen_pagetable_init; + x86_init.hyper.init_after_bootmem = xen_after_bootmem; pv_mmu_ops = xen_mmu_ops; diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 3e9d01ada81f..58f29a9d895d 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -57,6 +57,7 @@ #define MAP_NONBLOCK 0x20000 /* do not block on IO */ #define MAP_STACK 0x40000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x80000 /* create a huge page mapping */ +#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED # define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 79fcd2bae96b..bffe8616bd55 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -837,11 +837,8 @@ int __init memory_dev_init(void) * during boot and have been initialized */ mutex_lock(&mem_sysfs_mutex); - for (i = 0; i < NR_MEM_SECTIONS; i += sections_per_block) { - /* Don't iterate over sections we know are !present: */ - if (i > __highest_present_section_nr) - break; - + for (i = 0; i <= __highest_present_section_nr; + i += sections_per_block) { err = add_memory_block(i); if (!ret) ret = err; diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 6afe896e5cb8..96d26cfae90b 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -253,7 +253,7 @@ static inline void hwsim_clear_chanctx_magic(struct ieee80211_chanctx_conf *c) static unsigned int hwsim_net_id; -static struct ida hwsim_netgroup_ida = IDA_INIT; +static DEFINE_IDA(hwsim_netgroup_ida); struct hwsim_net { int netgroup; diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index 02c5984ab09b..6bb37c18292a 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -295,7 +295,7 @@ static void __init of_unittest_printf(void) return; } - num_to_str(phandle_str, sizeof(phandle_str), np->phandle); + num_to_str(phandle_str, sizeof(phandle_str), np->phandle, 0); of_unittest_printf_one(np, "%pOF", full_name); 
of_unittest_printf_one(np, "%pOFf", full_name); diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index cfb54e01d758..9d27016c899e 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -212,7 +212,6 @@ struct mport_cdev_priv { #ifdef CONFIG_RAPIDIO_DMA_ENGINE struct dma_chan *dmach; struct list_head async_list; - struct list_head pend_list; spinlock_t req_lock; struct mutex dma_lock; struct kref dma_ref; @@ -258,8 +257,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mport_cdev_wait); static struct class *dev_class; static dev_t dev_number; -static struct workqueue_struct *dma_wq; - static void mport_release_mapping(struct kref *ref); static int rio_mport_maint_rd(struct mport_cdev_priv *priv, void __user *arg, @@ -539,6 +536,7 @@ static int maint_comptag_set(struct mport_cdev_priv *priv, void __user *arg) #ifdef CONFIG_RAPIDIO_DMA_ENGINE struct mport_dma_req { + struct kref refcount; struct list_head node; struct file *filp; struct mport_cdev_priv *priv; @@ -554,11 +552,6 @@ struct mport_dma_req { struct completion req_comp; }; -struct mport_faf_work { - struct work_struct work; - struct mport_dma_req *req; -}; - static void mport_release_def_dma(struct kref *dma_ref) { struct mport_dev *md = @@ -578,8 +571,10 @@ static void mport_release_dma(struct kref *dma_ref) complete(&priv->comp); } -static void dma_req_free(struct mport_dma_req *req) +static void dma_req_free(struct kref *ref) { + struct mport_dma_req *req = container_of(ref, struct mport_dma_req, + refcount); struct mport_cdev_priv *priv = req->priv; unsigned int i; @@ -611,30 +606,7 @@ static void dma_xfer_callback(void *param) req->status = dma_async_is_tx_complete(priv->dmach, req->cookie, NULL, NULL); complete(&req->req_comp); -} - -static void dma_faf_cleanup(struct work_struct *_work) -{ - struct mport_faf_work *work = container_of(_work, - struct mport_faf_work, work); - struct mport_dma_req *req = work->req; - - dma_req_free(req); - kfree(work); -} - -static void dma_faf_callback(void *param) -{ - struct mport_dma_req *req = (struct mport_dma_req *)param; - struct mport_faf_work *work; - - work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (!work) - return; - - INIT_WORK(&work->work, dma_faf_cleanup); - work->req = req; - queue_work(dma_wq, &work->work); + kref_put(&req->refcount, dma_req_free); } /* @@ -765,16 +737,14 @@ static int do_dma_request(struct mport_dma_req *req, goto err_out; } - if (sync == RIO_TRANSFER_FAF) - tx->callback = dma_faf_callback; - else - tx->callback = dma_xfer_callback; + tx->callback = dma_xfer_callback; tx->callback_param = req; req->dmach = chan; req->sync = sync; req->status = DMA_IN_PROGRESS; init_completion(&req->req_comp); + kref_get(&req->refcount); cookie = dmaengine_submit(tx); req->cookie = cookie; @@ -785,6 +755,7 @@ static int do_dma_request(struct mport_dma_req *req, if (dma_submit_error(cookie)) { rmcd_error("submit err=%d (addr:0x%llx len:0x%llx)", cookie, xfer->rio_addr, xfer->length); + kref_put(&req->refcount, dma_req_free); ret = -EIO; goto err_out; } @@ -860,6 +831,8 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, if (!req) return -ENOMEM; + kref_init(&req->refcount); + ret = get_dma_channel(priv); if (ret) { kfree(req); @@ -968,42 +941,20 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, ret = do_dma_request(req, xfer, sync, nents); if (ret >= 0) { - if (sync == RIO_TRANSFER_SYNC) - goto sync_out; - return ret; /* return ASYNC cookie */ - } - - if (ret == -ETIMEDOUT || ret == 
-EINTR) { - /* - * This can happen only in case of SYNC transfer. - * Do not free unfinished request structure immediately. - * Place it into pending list and deal with it later - */ - spin_lock(&priv->req_lock); - list_add_tail(&req->node, &priv->pend_list); - spin_unlock(&priv->req_lock); - return ret; + if (sync == RIO_TRANSFER_ASYNC) + return ret; /* return ASYNC cookie */ + } else { + rmcd_debug(DMA, "do_dma_request failed with err=%d", ret); } - - rmcd_debug(DMA, "do_dma_request failed with err=%d", ret); -sync_out: - dma_unmap_sg(chan->device->dev, req->sgt.sgl, req->sgt.nents, dir); - sg_free_table(&req->sgt); err_pg: - if (page_list) { + if (!req->page_list) { for (i = 0; i < nr_pages; i++) put_page(page_list[i]); kfree(page_list); } err_req: - if (req->map) { - mutex_lock(&md->buf_mutex); - kref_put(&req->map->ref, mport_release_mapping); - mutex_unlock(&md->buf_mutex); - } - put_dma_channel(priv); - kfree(req); + kref_put(&req->refcount, dma_req_free); return ret; } @@ -1121,7 +1072,7 @@ static int rio_mport_wait_for_async_dma(struct file *filp, void __user *arg) ret = 0; if (req->status != DMA_IN_PROGRESS && req->status != DMA_PAUSED) - dma_req_free(req); + kref_put(&req->refcount, dma_req_free); return ret; @@ -1966,7 +1917,6 @@ static int mport_cdev_open(struct inode *inode, struct file *filp) #ifdef CONFIG_RAPIDIO_DMA_ENGINE INIT_LIST_HEAD(&priv->async_list); - INIT_LIST_HEAD(&priv->pend_list); spin_lock_init(&priv->req_lock); mutex_init(&priv->dma_lock); #endif @@ -2006,8 +1956,6 @@ static void mport_cdev_release_dma(struct file *filp) md = priv->md; - flush_workqueue(dma_wq); - spin_lock(&priv->req_lock); if (!list_empty(&priv->async_list)) { rmcd_debug(EXIT, "async list not empty filp=%p %s(%d)", @@ -2023,20 +1971,7 @@ static void mport_cdev_release_dma(struct file *filp) req->filp, req->cookie, completion_done(&req->req_comp)?"yes":"no"); list_del(&req->node); - dma_req_free(req); - } - } - - if (!list_empty(&priv->pend_list)) { - rmcd_debug(EXIT, "Free pending DMA requests for filp=%p %s(%d)", - filp, current->comm, task_pid_nr(current)); - list_for_each_entry_safe(req, - req_next, &priv->pend_list, node) { - rmcd_debug(EXIT, "free req->filp=%p cookie=%d compl=%s", - req->filp, req->cookie, - completion_done(&req->req_comp)?"yes":"no"); - list_del(&req->node); - dma_req_free(req); + kref_put(&req->refcount, dma_req_free); } } @@ -2048,15 +1983,6 @@ static void mport_cdev_release_dma(struct file *filp) current->comm, task_pid_nr(current), wret); } - spin_lock(&priv->req_lock); - - if (!list_empty(&priv->pend_list)) { - rmcd_debug(EXIT, "ATTN: pending DMA requests, filp=%p %s(%d)", - filp, current->comm, task_pid_nr(current)); - } - - spin_unlock(&priv->req_lock); - if (priv->dmach != priv->md->dma_chan) { rmcd_debug(EXIT, "Release DMA channel for filp=%p %s(%d)", filp, current->comm, task_pid_nr(current)); @@ -2573,8 +2499,6 @@ static void mport_cdev_remove(struct mport_dev *md) cdev_device_del(&md->cdev, &md->dev); mport_cdev_kill_fasync(md); - flush_workqueue(dma_wq); - /* TODO: do we need to give clients some time to close file * descriptors? Simple wait for XX, or kref? 
*/ @@ -2691,17 +2615,8 @@ static int __init mport_init(void) goto err_cli; } - dma_wq = create_singlethread_workqueue("dma_wq"); - if (!dma_wq) { - rmcd_error("failed to create DMA work queue"); - ret = -ENOMEM; - goto err_wq; - } - return 0; -err_wq: - class_interface_unregister(&rio_mport_interface); err_cli: unregister_chrdev_region(dev_number, RIO_MAX_MPORTS); err_chr: @@ -2717,7 +2632,6 @@ static void __exit mport_exit(void) class_interface_unregister(&rio_mport_interface); class_destroy(dev_class); unregister_chrdev_region(dev_number, RIO_MAX_MPORTS); - destroy_workqueue(dma_wq); } module_init(mport_init); diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index 23429bdaca84..161b927d9de1 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -76,7 +76,7 @@ static u16 rio_destid_alloc(struct rio_net *net) } /** - * rio_destid_reserve - Reserve the specivied destID + * rio_destid_reserve - Reserve the specified destID * @net: RIO network * @destid: destID to reserve * @@ -885,7 +885,7 @@ static struct rio_net *rio_scan_alloc_net(struct rio_mport *mport, * * For each enumerated device, ensure that each switch in a system * has correct routing entries. Add routes for devices that where - * unknown dirung the first enumeration pass through the switch. + * unknown during the first enumeration pass through the switch. */ static void rio_update_route_tables(struct rio_net *net) { @@ -983,7 +983,7 @@ static int rio_enum_mport(struct rio_mport *mport, u32 flags) /* reserve mport destID in new net */ rio_destid_reserve(net, mport->host_deviceid); - /* Enable Input Output Port (transmitter reviever) */ + /* Enable Input Output Port (transmitter receiver) */ rio_enable_rx_tx_port(mport, 1, 0, 0, 0); /* Set component tag for host */ diff --git a/drivers/staging/lustre/lustre/llite/glimpse.c b/drivers/staging/lustre/lustre/llite/glimpse.c index c43ac574274c..3075358f3f08 100644 --- a/drivers/staging/lustre/lustre/llite/glimpse.c +++ b/drivers/staging/lustre/lustre/llite/glimpse.c @@ -69,7 +69,7 @@ blkcnt_t dirty_cnt(struct inode *inode) void *results[1]; if (inode->i_mapping) - cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree, + cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->i_pages, results, 0, 1, PAGECACHE_TAG_DIRTY); if (cnt == 0 && atomic_read(&vob->vob_mmap_cnt) > 0) diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c index 3b1c8e5a3053..8ee7b4d273b2 100644 --- a/drivers/staging/lustre/lustre/mdc/mdc_request.c +++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c @@ -934,14 +934,14 @@ static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, struct page *page; int found; - spin_lock_irq(&mapping->tree_lock); - found = radix_tree_gang_lookup(&mapping->page_tree, + xa_lock_irq(&mapping->i_pages); + found = radix_tree_gang_lookup(&mapping->i_pages, (void **)&page, offset, 1); if (found > 0 && !radix_tree_exceptional_entry(page)) { struct lu_dirpage *dp; get_page(page); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* * In contrast to find_lock_page() we are sure that directory * page cannot be truncated (while DLM lock is held) and, @@ -989,7 +989,7 @@ static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, page = ERR_PTR(-EIO); } } else { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); page = NULL; } return page; diff --git a/fs/afs/write.c b/fs/afs/write.c index 
9370e2feb999..dbc3c0b0142d 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -570,10 +570,11 @@ static int afs_writepages_region(struct address_space *mapping, _debug("wback %lx", page->index); - /* at this point we hold neither mapping->tree_lock nor lock on - * the page itself: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled back from - * swapper_space to tmpfs file mapping + /* + * at this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping */ ret = lock_page_killable(page); if (ret < 0) { diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index a0c57c37fa21..be9c3dc048ab 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -19,9 +19,6 @@ */ static autofs_wqt_t autofs4_next_wait_queue = 1; -/* These are the signals we allow interrupting a pending mount */ -#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT)) - void autofs4_catatonic_mode(struct autofs_sb_info *sbi) { struct autofs_wait_queue *wq, *nwq; @@ -486,29 +483,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, * wq->name.name is NULL iff the lock is already released * or the mount has been made catatonic. */ - if (wq->name.name) { - /* Block all but "shutdown" signals while waiting */ - unsigned long shutdown_sigs_mask; - unsigned long irqflags; - sigset_t oldset; - - spin_lock_irqsave(¤t->sighand->siglock, irqflags); - oldset = current->blocked; - shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0]; - siginitsetinv(¤t->blocked, shutdown_sigs_mask); - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); - - wait_event_interruptible(wq->queue, wq->name.name == NULL); - - spin_lock_irqsave(¤t->sighand->siglock, irqflags); - current->blocked = oldset; - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); - } else { - pr_debug("skipped sleeping\n"); - } - + wait_event_killable(wq->queue, wq->name.name == NULL); status = wq->status; /* @@ -574,7 +549,7 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok kfree(wq->name.name); wq->name.name = NULL; /* Do not wait on this queue */ wq->status = status; - wake_up_interruptible(&wq->queue); + wake_up(&wq->queue); if (!--wq->wait_ctr) kfree(wq); mutex_unlock(&sbi->wq_mutex); diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index ce1824f47ba6..c3deb2e35f20 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -330,6 +330,7 @@ beyond_if: #ifdef __alpha__ regs->gp = ex.a_gpvalue; #endif + finalize_exec(bprm); start_thread(regs, ex.a_entry, current->mm->start_stack); return 0; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index bdb201230bae..41e04183e4ce 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -377,6 +377,11 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, } else map_addr = vm_mmap(filep, addr, size, prot, type, off); + if ((type & MAP_FIXED_NOREPLACE) && BAD_ADDR(map_addr)) + pr_info("%d (%s): Uhuuh, elf segment at %p requested but the memory is mapped already\n", + task_pid_nr(current), current->comm, + (void *)addr); + return(map_addr); } @@ -575,7 +580,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, elf_prot |= PROT_EXEC; vaddr = eppnt->p_vaddr; if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) - elf_type |= MAP_FIXED; + elf_type |= MAP_FIXED_NOREPLACE; else if (no_base && interp_elf_ex->e_type == 
ET_DYN) load_addr = -vaddr; @@ -890,7 +895,7 @@ static int load_elf_binary(struct linux_binprm *bprm) the correct location in memory. */ for(i = 0, elf_ppnt = elf_phdata; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { - int elf_prot = 0, elf_flags; + int elf_prot = 0, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE; unsigned long k, vaddr; unsigned long total_size = 0; @@ -922,6 +927,13 @@ static int load_elf_binary(struct linux_binprm *bprm) */ } } + + /* + * Some binaries have overlapping elf segments and then + * we have to forcefully map over an existing mapping + * e.g. over this newly established brk mapping. + */ + elf_fixed = MAP_FIXED; } if (elf_ppnt->p_flags & PF_R) @@ -939,7 +951,7 @@ static int load_elf_binary(struct linux_binprm *bprm) * the ET_DYN load_addr calculations, proceed normally. */ if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { - elf_flags |= MAP_FIXED; + elf_flags |= elf_fixed; } else if (loc->elf_ex.e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program @@ -975,7 +987,7 @@ static int load_elf_binary(struct linux_binprm *bprm) load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - elf_flags |= MAP_FIXED; + elf_flags |= elf_fixed; } else load_bias = 0; @@ -1155,6 +1167,7 @@ static int load_elf_binary(struct linux_binprm *bprm) ELF_PLAT_INIT(regs, reloc_func_desc); #endif + finalize_exec(bprm); start_thread(regs, elf_entry, bprm->p); retval = 0; out: @@ -1234,7 +1247,7 @@ static int load_elf_library(struct file *file) (eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr)), PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, + MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_DENYWRITE, (eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr))); if (error != ELF_PAGESTART(eppnt->p_vaddr)) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 429326b6e2e7..d90993adeffa 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -463,6 +463,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) dynaddr); #endif + finalize_exec(bprm); /* everything is now ready... 
get the userspace context ready to roll */ entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; start_thread(regs, entryaddr, current->mm->start_stack); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 5d6b94475f27..82a48e830018 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -994,6 +994,7 @@ static int load_flat_binary(struct linux_binprm *bprm) FLAT_PLAT_INIT(regs); #endif + finalize_exec(bprm); pr_debug("start_thread(regs=0x%p, entry=0x%lx, start_stack=0x%lx)\n", regs, start_addr, current->mm->start_stack); start_thread(regs, start_addr, current->mm->start_stack); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 562c3e633403..578181cd96b5 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -458,7 +458,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, break; rcu_read_lock(); - page = radix_tree_lookup(&mapping->page_tree, pg_index); + page = radix_tree_lookup(&mapping->i_pages, pg_index); rcu_read_unlock(); if (page && !radix_tree_exceptional_entry(page)) { misses++; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 47a8fe9d22e8..cf87976e389d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3963,11 +3963,11 @@ retry: done_index = page->index; /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping + * At this point we hold neither the i_pages lock nor + * the page lock: the page may be truncated or + * invalidated (changing page->mapping to NULL), + * or even swizzled back from swapper_space to + * tmpfs file mapping */ if (!trylock_page(page)) { flush_write_bio(epd); @@ -5174,13 +5174,13 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(!PagePrivate(page)); clear_page_dirty_for_io(page); - spin_lock_irq(&page->mapping->tree_lock); + xa_lock_irq(&page->mapping->i_pages); if (!PageDirty(page)) { - radix_tree_tag_clear(&page->mapping->page_tree, + radix_tree_tag_clear(&page->mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&page->mapping->tree_lock); + xa_unlock_irq(&page->mapping->i_pages); ClearPageError(page); unlock_page(page); } diff --git a/fs/buffer.c b/fs/buffer.c index ec5dd39071e6..f3491074b035 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -185,10 +185,9 @@ EXPORT_SYMBOL(end_buffer_write_sync); * we get exclusion from try_to_free_buffers with the blockdev mapping's * private_lock. * - * Hack idea: for the blockdev mapping, i_bufferlist_lock contention + * Hack idea: for the blockdev mapping, private_lock contention * may be quite high. This code could TryLock the page, and if that - * succeeds, there is no need to take private_lock. (But if - * private_lock is contended then so is mapping->tree_lock). + * succeeds, there is no need to take private_lock. */ static struct buffer_head * __find_get_block_slow(struct block_device *bdev, sector_t block) @@ -594,20 +593,21 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * * The caller must hold lock_page_memcg(). */ -static void __set_page_dirty(struct page *page, struct address_space *mapping, +void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); if (page->mapping) { /* Race with truncate? 
*/ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); } +EXPORT_SYMBOL_GPL(__set_page_dirty); /* * Add a page to the dirty page list. @@ -1095,7 +1095,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, - * mapping->tree_lock and mapping->host->i_lock. + * i_pages lock and mapping->host->i_lock. */ void mark_buffer_dirty(struct buffer_head *bh) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7cee97b93a61..4bcd4e838b47 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1987,11 +1987,10 @@ wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages, for (i = 0; i < found_pages; i++) { page = wdata->pages[i]; /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping + * At this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping */ if (nr_pages == 0) @@ -158,11 +158,9 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo } /* - * We do not necessarily hold the mapping->tree_lock when we call this - * function so it is possible that 'entry' is no longer a valid item in the - * radix tree. This is okay because all we really need to do is to find the - * correct waitqueue where tasks might be waiting for that old 'entry' and - * wake them. + * @entry may no longer be the entry at the index in the mapping. + * The important information it's conveying is whether the entry at + * this index used to be a PMD entry. */ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, pgoff_t index, void *entry, bool wake_all) @@ -174,7 +172,7 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, /* * Checking for locked entry and prepare_to_wait_exclusive() happens - * under mapping->tree_lock, ditto for entry handling in our callers. + * under the i_pages lock, ditto for entry handling in our callers. * So at this point all tasks that could have seen our entry locked * must be in the waitqueue and the following check will see them. */ @@ -183,41 +181,39 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, } /* - * Check whether the given slot is locked. The function must be called with - * mapping->tree_lock held + * Check whether the given slot is locked. Must be called with the i_pages + * lock held. */ static inline int slot_locked(struct address_space *mapping, void **slot) { unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); return entry & RADIX_DAX_ENTRY_LOCK; } /* - * Mark the given slot is locked. The function must be called with - * mapping->tree_lock held + * Mark the given slot as locked. Must be called with the i_pages lock held. 
*/ static inline void *lock_slot(struct address_space *mapping, void **slot) { unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); + radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); return (void *)entry; } /* - * Mark the given slot is unlocked. The function must be called with - * mapping->tree_lock held + * Mark the given slot as unlocked. Must be called with the i_pages lock held. */ static inline void *unlock_slot(struct address_space *mapping, void **slot) { unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); + radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); return (void *)entry; } @@ -228,7 +224,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) * put_locked_mapping_entry() when he locked the entry and now wants to * unlock it. * - * The function must be called with mapping->tree_lock held. + * Must be called with the i_pages lock held. */ static void *get_unlocked_mapping_entry(struct address_space *mapping, pgoff_t index, void ***slotp) @@ -241,7 +237,7 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, ewait.wait.func = wake_exceptional_entry_func; for (;;) { - entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, + entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || @@ -254,10 +250,10 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); schedule(); finish_wait(wq, &ewait.wait); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } } @@ -266,15 +262,15 @@ static void dax_unlock_mapping_entry(struct address_space *mapping, { void *entry, **slot; - spin_lock_irq(&mapping->tree_lock); - entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + xa_lock_irq(&mapping->i_pages); + entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || !slot_locked(mapping, slot))) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return; } unlock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); dax_wake_mapping_entry_waiter(mapping, index, entry, false); } @@ -388,7 +384,7 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, void *entry, **slot; restart: - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, &slot); if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { @@ -420,12 +416,12 @@ restart: if (pmd_downgrade) { /* * Make sure 'entry' remains valid while we drop - * mapping->tree_lock. + * the i_pages lock. 
*/ entry = lock_slot(mapping, slot); } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* * Besides huge zero pages the only other thing that gets * downgraded are empty entries which don't need to be @@ -442,27 +438,27 @@ restart: put_locked_mapping_entry(mapping, index); return ERR_PTR(err); } - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); if (!entry) { /* - * We needed to drop the page_tree lock while calling + * We needed to drop the i_pages lock while calling * radix_tree_preload() and we didn't have an entry to * lock. See if another thread inserted an entry at * our index during this time. */ - entry = __radix_tree_lookup(&mapping->page_tree, index, + entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (entry) { radix_tree_preload_end(); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); goto restart; } } if (pmd_downgrade) { dax_disassociate_entry(entry, mapping, false); - radix_tree_delete(&mapping->page_tree, index); + radix_tree_delete(&mapping->i_pages, index); mapping->nrexceptional--; dax_wake_mapping_entry_waiter(mapping, index, entry, true); @@ -470,11 +466,11 @@ restart: entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); - err = __radix_tree_insert(&mapping->page_tree, index, + err = __radix_tree_insert(&mapping->i_pages, index, dax_radix_order(entry), entry); radix_tree_preload_end(); if (err) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* * Our insertion of a DAX entry failed, most likely * because we were inserting a PMD entry and it @@ -487,12 +483,12 @@ restart: } /* Good, we have inserted empty locked entry into the tree. */ mapping->nrexceptional++; - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return entry; } entry = lock_slot(mapping, slot); out_unlock: - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return entry; } @@ -501,23 +497,23 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, { int ret = 0; void *entry; - struct radix_tree_root *page_tree = &mapping->page_tree; + struct radix_tree_root *pages = &mapping->i_pages; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(pages); entry = get_unlocked_mapping_entry(mapping, index, NULL); if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) goto out; if (!trunc && - (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || - radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))) + (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) || + radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))) goto out; dax_disassociate_entry(entry, mapping, trunc); - radix_tree_delete(page_tree, index); + radix_tree_delete(pages, index); mapping->nrexceptional--; ret = 1; out: put_unlocked_mapping_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(pages); return ret; } /* @@ -587,7 +583,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, void *entry, pfn_t pfn_t, unsigned long flags, bool dirty) { - struct radix_tree_root *page_tree = &mapping->page_tree; + struct radix_tree_root *pages = &mapping->i_pages; unsigned long pfn = pfn_t_to_pfn(pfn_t); pgoff_t index = vmf->pgoff; void *new_entry; @@ -604,7 +600,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, unmap_mapping_pages(mapping, vmf->pgoff, 1, false); } - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(pages); new_entry = 
dax_radix_locked_entry(pfn, flags); if (dax_entry_size(entry) != dax_entry_size(new_entry)) { dax_disassociate_entry(entry, mapping, false); @@ -624,17 +620,17 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, void **slot; void *ret; - ret = __radix_tree_lookup(page_tree, index, &node, &slot); + ret = __radix_tree_lookup(pages, index, &node, &slot); WARN_ON_ONCE(ret != entry); - __radix_tree_replace(page_tree, node, slot, + __radix_tree_replace(pages, node, slot, new_entry, NULL); entry = new_entry; } if (dirty) - radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); + radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(pages); return entry; } @@ -723,7 +719,7 @@ unlock_pte: static int dax_writeback_one(struct dax_device *dax_dev, struct address_space *mapping, pgoff_t index, void *entry) { - struct radix_tree_root *page_tree = &mapping->page_tree; + struct radix_tree_root *pages = &mapping->i_pages; void *entry2, **slot; unsigned long pfn; long ret = 0; @@ -736,7 +732,7 @@ static int dax_writeback_one(struct dax_device *dax_dev, if (WARN_ON(!radix_tree_exceptional_entry(entry))) return -EIO; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(pages); entry2 = get_unlocked_mapping_entry(mapping, index, &slot); /* Entry got punched out / reallocated? */ if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) @@ -755,7 +751,7 @@ static int dax_writeback_one(struct dax_device *dax_dev, } /* Another fsync thread may have already written back this entry */ - if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)) goto put_unlocked; /* Lock the entry to serialize with page faults */ entry = lock_slot(mapping, slot); @@ -763,11 +759,11 @@ static int dax_writeback_one(struct dax_device *dax_dev, * We can clear the tag now but we have to be careful so that concurrent * dax_writeback_one() calls for the same index cannot finish before we * actually flush the caches. This is achieved as the calls will look - * at the entry only under tree_lock and once they do that they will - * see the entry locked and wait for it to unlock. + * at the entry only under the i_pages lock and once they do that + * they will see the entry locked and wait for it to unlock. */ - radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); - spin_unlock_irq(&mapping->tree_lock); + radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE); + xa_unlock_irq(pages); /* * Even if dax_writeback_mapping_range() was given a wbc->range_start @@ -787,16 +783,16 @@ static int dax_writeback_one(struct dax_device *dax_dev, * the pfn mappings are writeprotected and fault waits for mapping * entry lock. 
*/ - spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&mapping->tree_lock); + xa_lock_irq(pages); + radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY); + xa_unlock_irq(pages); trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); put_locked_mapping_entry(mapping, index); return ret; put_unlocked: put_unlocked_mapping_entry(mapping, index, entry2); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(pages); return ret; } @@ -1566,21 +1562,21 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, pgoff_t index = vmf->pgoff; int vmf_ret, error; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, &slot); /* Did we race with someone splitting entry or so? */ if (!entry || (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { put_unlocked_mapping_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } - radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY); entry = lock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); switch (pe_size) { case PE_SIZE_PTE: error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); diff --git a/fs/dcache.c b/fs/dcache.c index 593079176123..86d2de63461e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -257,11 +257,25 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } +static void __d_free_external_name(struct rcu_head *head) +{ + struct external_name *name = container_of(head, struct external_name, + u.head); + + mod_node_page_state(page_pgdat(virt_to_page(name)), + NR_INDIRECTLY_RECLAIMABLE_BYTES, + -ksize(name)); + + kfree(name); +} + static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); - kfree(external_name(dentry)); - kmem_cache_free(dentry_cache, dentry); + + __d_free_external_name(&external_name(dentry)->u.head); + + kmem_cache_free(dentry_cache, dentry); } static inline int dname_external(const struct dentry *dentry) @@ -291,7 +305,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name) struct external_name *p; p = container_of(name->name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->u.count))) - kfree_rcu(p, u.head); + call_rcu(&p->u.head, __d_free_external_name); } } EXPORT_SYMBOL(release_dentry_name_snapshot); @@ -1038,6 +1052,8 @@ static void shrink_dentry_list(struct list_head *list) while (!list_empty(list)) { struct dentry *dentry, *parent; + cond_resched(); + dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); @@ -1191,7 +1207,6 @@ void shrink_dcache_sb(struct super_block *sb) this_cpu_sub(nr_dentry_unused, freed); shrink_dentry_list(&dispose); - cond_resched(); } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); @@ -1473,7 +1488,6 @@ void shrink_dcache_parent(struct dentry *parent) break; shrink_dentry_list(&data.dispose); - cond_resched(); } } EXPORT_SYMBOL(shrink_dcache_parent); @@ -1600,7 +1614,6 @@ void d_invalidate(struct dentry *dentry) detach_mounts(data.mountpoint); dput(data.mountpoint); } - cond_resched(); } } 
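The __d_free_external_name() hunk above (and the matching accounting added to __d_alloc() just below) adjusts NR_INDIRECTLY_RECLAIMABLE_BYTES by ksize() of the external name, i.e. the size the slab allocator actually granted rather than the size requested. The glibc-specific malloc_usable_size() exposes the same requested-versus-granted gap from userspace; the numbers printed depend entirely on the allocator and are only illustrative.

/* Userspace illustration: the allocator may round a request up, so
 * accounting by the requested size would under-count real memory use. */
#include <malloc.h>             /* malloc_usable_size() - glibc extension */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        size_t requested[] = { 5, 40, 100, 200 };

        for (size_t i = 0; i < sizeof(requested) / sizeof(requested[0]); i++) {
                void *p = malloc(requested[i]);

                if (!p)
                        return 1;
                /* ksize() answers the same question for kmalloc() objects */
                printf("asked for %3zu bytes, usable %3zu bytes\n",
                       requested[i], malloc_usable_size(p));
                free(p);
        }
        return 0;
}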
EXPORT_SYMBOL(d_invalidate); @@ -1617,6 +1630,7 @@ EXPORT_SYMBOL(d_invalidate); struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { + struct external_name *ext = NULL; struct dentry *dentry; char *dname; int err; @@ -1637,14 +1651,14 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dname = dentry->d_iname; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); - struct external_name *p = kmalloc(size + name->len, - GFP_KERNEL_ACCOUNT); - if (!p) { + + ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT); + if (!ext) { kmem_cache_free(dentry_cache, dentry); return NULL; } - atomic_set(&p->u.count, 1); - dname = p->name; + atomic_set(&ext->u.count, 1); + dname = ext->name; } else { dname = dentry->d_iname; } @@ -1683,6 +1697,12 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) } } + if (unlikely(ext)) { + pg_data_t *pgdat = page_pgdat(virt_to_page(ext)); + mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES, + ksize(ext)); + } + this_cpu_inc(nr_dentry); return dentry; @@ -2770,7 +2790,7 @@ static void copy_name(struct dentry *dentry, struct dentry *target) dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) - kfree_rcu(old_name, u.head); + call_rcu(&old_name->u.head, __d_free_external_name); } /* diff --git a/fs/exec.c b/fs/exec.c index a919a827d181..183059c427b9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -257,7 +257,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * to work from. */ limit = _STK_LIM / 4 * 3; - limit = min(limit, rlimit(RLIMIT_STACK) / 4); + limit = min(limit, bprm->rlim_stack.rlim_cur / 4); if (size > limit) goto fail; } @@ -411,6 +411,11 @@ static int bprm_mm_init(struct linux_binprm *bprm) if (!mm) goto err; + /* Save current stack limit for all calculations made during exec. */ + task_lock(current->group_leader); + bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; + task_unlock(current->group_leader); + err = __bprm_mm_init(bprm); if (err) goto err; @@ -697,7 +702,7 @@ int setup_arg_pages(struct linux_binprm *bprm, #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ - stack_base = rlimit_max(RLIMIT_STACK); + stack_base = bprm->rlim_stack.rlim_max; if (stack_base > STACK_SIZE_MAX) stack_base = STACK_SIZE_MAX; @@ -770,7 +775,7 @@ int setup_arg_pages(struct linux_binprm *bprm, * Align this down to a page boundary as expand_stack * will align it up. */ - rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK; + rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; #ifdef CONFIG_STACK_GROWSUP if (stack_size + stack_expand > rlim_stack) stack_base = vma->vm_start + rlim_stack; @@ -1341,11 +1346,11 @@ void setup_new_exec(struct linux_binprm * bprm) * RLIMIT_STACK, but after the point of no return to avoid * needing to clean up the change on failure. */ - if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM) - current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; + if (bprm->rlim_stack.rlim_cur > _STK_LIM) + bprm->rlim_stack.rlim_cur = _STK_LIM; } - arch_pick_mmap_layout(current->mm); + arch_pick_mmap_layout(current->mm, &bprm->rlim_stack); current->sas_ss_sp = current->sas_ss_size = 0; @@ -1378,6 +1383,16 @@ void setup_new_exec(struct linux_binprm * bprm) } EXPORT_SYMBOL(setup_new_exec); +/* Runs immediately before start_thread() takes over. 
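The exec.c hunks above snapshot RLIMIT_STACK into bprm->rlim_stack in bprm_mm_init(), use that copy for every later size calculation, clamp the copy to _STK_LIM in setup_new_exec(), and only write it back in finalize_exec(), whose body follows. Here is a small userspace look at the same limit via getrlimit(); STACK_CAP below is an arbitrary stand-in for _STK_LIM, and nothing here touches the kernel structures involved.

/* Read RLIMIT_STACK once, work with the snapshot, clamp only the copy. */
#include <stdio.h>
#include <sys/resource.h>

#define STACK_CAP (8UL * 1024 * 1024)   /* stand-in for _STK_LIM (8 MiB) */

int main(void)
{
        struct rlimit snap;

        if (getrlimit(RLIMIT_STACK, &snap) != 0) {
                perror("getrlimit");
                return 1;
        }
        printf("current stack rlimit: cur=%llu max=%llu\n",
               (unsigned long long)snap.rlim_cur,
               (unsigned long long)snap.rlim_max);

        /* Like setup_new_exec(): clamp the saved copy, not the live limit. */
        if (snap.rlim_cur > STACK_CAP)
                snap.rlim_cur = STACK_CAP;
        printf("value used for layout decisions: %llu\n",
               (unsigned long long)snap.rlim_cur);
        /* finalize_exec() is the point where the kernel writes the copy back. */
        return 0;
}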
*/ +void finalize_exec(struct linux_binprm *bprm) +{ + /* Store any stack rlimit changes before starting thread. */ + task_lock(current->group_leader); + current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; + task_unlock(current->group_leader); +} +EXPORT_SYMBOL(finalize_exec); + /* * Prepare credentials and lock ->cred_guard_mutex. * install_exec_creds() commits the new creds and drops the lock. diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index db50686f5096..02237d4d91f5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2424,12 +2424,12 @@ void f2fs_set_page_dirty_nobuffers(struct page *page) SetPageDirty(page); spin_unlock(&mapping->private_lock); - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); WARN_ON_ONCE(!PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index fe661274ff10..8c9c2f31b253 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -732,10 +732,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, page_index(page), + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); clear_page_dirty_for_io(page); ClearPagePrivate(page); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index bfb7a4a3a929..9327411fd93b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1015,7 +1015,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), - .iroot = RADIX_TREE_INIT(GFP_NOFS), + .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; trace_f2fs_gc_begin(sbi->sb, sync, background, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3b77d6421218..265da200daa8 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -226,10 +226,10 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, page_index(page), + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9a99243054ba..f202398e20ea 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -91,11 +91,11 @@ static void clear_node_page_dirty(struct page *page) unsigned int long flags; if (PageDirty(page)) { - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); 
clear_page_dirty_for_io(page); dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); @@ -1161,7 +1161,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_bug_on(sbi, check_nid_range(sbi, nid)); rcu_read_lock(); - apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid); + apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid); rcu_read_unlock(); if (apage) return; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1280f915079b..4b12ba70a895 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -347,9 +347,9 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions * between unlocked_inode_to_wb_begin/end() are guaranteed to be - * synchronizing against mapping->tree_lock. + * synchronizing against the i_pages lock. * - * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock + * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock * gives us exclusion against all wb related operations on @inode * including IO list manipulations and stat updates. */ @@ -361,7 +361,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); } spin_lock(&inode->i_lock); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); /* * Once I_FREEING is visible under i_lock, the eviction path owns @@ -373,22 +373,22 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) /* * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to - * pages actually under underwriteback. + * pages actually under writeback. */ - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, PAGECACHE_TAG_DIRTY) { struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (likely(page) && PageDirty(page)) { dec_wb_stat(old_wb, WB_RECLAIMABLE); inc_wb_stat(new_wb, WB_RECLAIMABLE); } } - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, PAGECACHE_TAG_WRITEBACK) { struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (likely(page)) { WARN_ON_ONCE(!PageWriteback(page)); dec_wb_stat(old_wb, WB_WRITEBACK); @@ -430,7 +430,7 @@ skip_switch: */ smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); @@ -506,8 +506,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the mapping's - * tree_lock so that stat transfer can synchronize against them. + * the RCU protected stat update paths to grab the i_page + * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be visible. 
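inode_switch_wbs_work_fn() above publishes the cleared I_WB_SWITCH bit with smp_store_release() so that all the stat transfers done under the locks are visible to anyone who observes the new i_state (read with smp_load_acquire() in the backing-dev helpers later in this diff). The same release/acquire pairing can be expressed with portable C11 atomics and a toy payload; the variable names below are invented.

/* C11 release/acquire pairing analogous to smp_store_release() /
 * smp_load_acquire(): a reader that sees 'ready' set is guaranteed to
 * also see the payload written before the release store. */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static int payload;             /* written before the release store */
static atomic_int ready;

static void *producer(void *arg)
{
        (void)arg;
        payload = 123;                                  /* ordinary write */
        atomic_store_explicit(&ready, 1, memory_order_release);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, producer, NULL);
        while (!atomic_load_explicit(&ready, memory_order_acquire))
                ;                                       /* spin until published */
        printf("payload = %d\n", payload);              /* guaranteed 123 */
        pthread_join(t, NULL);
        return 0;
}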
*/ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 7dc55b93a830..97137d7ec5ee 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -832,7 +832,7 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, /* Clear pointers back to the netfs */ cookie->netfs_data = NULL; cookie->def = NULL; - BUG_ON(cookie->stores.rnode); + BUG_ON(!radix_tree_empty(&cookie->stores)); if (cookie->parent) { ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 1085ca12e25c..20e0d0a4dc8c 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -973,7 +973,7 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj * retire the object instead. */ if (!fscache_use_cookie(object)) { - ASSERT(object->cookie->stores.rnode == NULL); + ASSERT(radix_tree_empty(&object->cookie->stores)); set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); _leave(" [no cookie]"); return transit_to(KILL_OBJECT); diff --git a/fs/inode.c b/fs/inode.c index b153aeaa61ea..13ceb98c3bd3 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -348,8 +348,7 @@ EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT); - spin_lock_init(&mapping->tree_lock); + INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); @@ -504,14 +503,14 @@ EXPORT_SYMBOL(__remove_inode_hash); void clear_inode(struct inode *inode) { /* - * We have to cycle tree_lock here because reclaim can be still in the + * We have to cycle the i_pages lock here because reclaim can be in the * process of removing the last page (in __delete_from_page_cache()) - * and we must not free mapping under it. + * and we must not free the mapping under it. */ - spin_lock_irq(&inode->i_data.tree_lock); + xa_lock_irq(&inode->i_data.i_pages); BUG_ON(inode->i_data.nrpages); BUG_ON(inode->i_data.nrexceptional); - spin_unlock_irq(&inode->i_data.tree_lock); + xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index c21e0b4454a6..dec98cab729d 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -193,9 +193,9 @@ retry: (unsigned long long)oldkey, (unsigned long long)newkey); - spin_lock_irq(&btnc->tree_lock); - err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); - spin_unlock_irq(&btnc->tree_lock); + xa_lock_irq(&btnc->i_pages); + err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page); + xa_unlock_irq(&btnc->i_pages); /* * Note: page->index will not change to newkey until * nilfs_btnode_commit_change_key() will be called. 
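clear_inode() above takes the i_pages lock around a couple of BUG_ON() checks and drops it again; the real purpose of cycling the lock is to wait out any reclaim path that still holds it while finishing __delete_from_page_cache(), so the mapping is not freed underneath that path. Below is a userspace sketch of the same lock-cycling idiom; struct mapping and teardown() are stand-ins, not kernel code.

/* "Cycle the lock": acquire and release it mainly to be sure nobody who
 * grabbed it earlier is still inside their critical section. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct mapping {
        pthread_mutex_t lock;
        int nrpages;
};

static void teardown(struct mapping *m)
{
        /* Any remover still holding m->lock finishes before we proceed. */
        pthread_mutex_lock(&m->lock);
        if (m->nrpages != 0) {
                fprintf(stderr, "pages still present\n");
                abort();                /* the kernel uses BUG_ON() here */
        }
        pthread_mutex_unlock(&m->lock);
        free(m);
}

int main(void)
{
        struct mapping *m = calloc(1, sizeof(*m));

        if (!m)
                return 1;
        pthread_mutex_init(&m->lock, NULL);
        teardown(m);
        puts("mapping freed");
        return 0;
}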
@@ -251,11 +251,11 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, (unsigned long long)newkey); mark_buffer_dirty(obh); - spin_lock_irq(&btnc->tree_lock); - radix_tree_delete(&btnc->page_tree, oldkey); - radix_tree_tag_set(&btnc->page_tree, newkey, + xa_lock_irq(&btnc->i_pages); + radix_tree_delete(&btnc->i_pages, oldkey); + radix_tree_tag_set(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&btnc->tree_lock); + xa_unlock_irq(&btnc->i_pages); opage->index = obh->b_blocknr = newkey; unlock_page(opage); @@ -283,9 +283,9 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, return; if (nbh == NULL) { /* blocksize == pagesize */ - spin_lock_irq(&btnc->tree_lock); - radix_tree_delete(&btnc->page_tree, newkey); - spin_unlock_irq(&btnc->tree_lock); + xa_lock_irq(&btnc->i_pages); + radix_tree_delete(&btnc->i_pages, newkey); + xa_unlock_irq(&btnc->i_pages); unlock_page(ctxt->bh->b_page); } else brelse(nbh); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 68241512d7c1..4cb850a6f1c2 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -331,15 +331,15 @@ repeat: struct page *page2; /* move the page to the destination cache */ - spin_lock_irq(&smap->tree_lock); - page2 = radix_tree_delete(&smap->page_tree, offset); + xa_lock_irq(&smap->i_pages); + page2 = radix_tree_delete(&smap->i_pages, offset); WARN_ON(page2 != page); smap->nrpages--; - spin_unlock_irq(&smap->tree_lock); + xa_unlock_irq(&smap->i_pages); - spin_lock_irq(&dmap->tree_lock); - err = radix_tree_insert(&dmap->page_tree, offset, page); + xa_lock_irq(&dmap->i_pages); + err = radix_tree_insert(&dmap->i_pages, offset, page); if (unlikely(err < 0)) { WARN_ON(err == -EEXIST); page->mapping = NULL; @@ -348,11 +348,11 @@ repeat: page->mapping = dmap; dmap->nrpages++; if (PageDirty(page)) - radix_tree_tag_set(&dmap->page_tree, + radix_tree_tag_set(&dmap->i_pages, offset, PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&dmap->tree_lock); + xa_unlock_irq(&dmap->i_pages); } unlock_page(page); } @@ -474,15 +474,15 @@ int __nilfs_clear_page_dirty(struct page *page) struct address_space *mapping = page->mapping; if (mapping) { - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); if (test_bit(PG_dirty, &page->flags)) { - radix_tree_tag_clear(&mapping->page_tree, + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return clear_page_dirty_for_io(page); } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return 0; } return TestClearPageDirty(page); diff --git a/fs/proc/array.c b/fs/proc/array.c index 598803576e4c..ae2c807fd719 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -141,25 +141,12 @@ static inline const char *get_task_state(struct task_struct *tsk) return task_state_array[task_state_index(tsk)]; } -static inline int get_task_umask(struct task_struct *tsk) -{ - struct fs_struct *fs; - int umask = -ENOENT; - - task_lock(tsk); - fs = tsk->fs; - if (fs) - umask = fs->umask; - task_unlock(tsk); - return umask; -} - static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; - int g, umask; + int g, umask = -1; struct task_struct *tracer; const struct cred *cred; pid_t ppid, tpid = 0, tgid, ngid; @@ -177,17 +164,18 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, ngid = 
task_numa_group_id(p); cred = get_task_cred(p); - umask = get_task_umask(p); - if (umask >= 0) - seq_printf(m, "Umask:\t%#04o\n", umask); - task_lock(p); + if (p->fs) + umask = p->fs->umask; if (p->files) max_fds = files_fdtable(p->files)->max_fds; task_unlock(p); rcu_read_unlock(); - seq_printf(m, "State:\t%s", get_task_state(p)); + if (umask >= 0) + seq_printf(m, "Umask:\t%#04o\n", umask); + seq_puts(m, "State:\t"); + seq_puts(m, get_task_state(p)); seq_put_decimal_ull(m, "\nTgid:\t", tgid); seq_put_decimal_ull(m, "\nNgid:\t", ngid); @@ -313,8 +301,8 @@ static void render_cap_t(struct seq_file *m, const char *header, seq_puts(m, header); CAP_FOR_EACH_U32(__capi) { - seq_printf(m, "%08x", - a->cap[CAP_LAST_U32 - __capi]); + seq_put_hex_ll(m, NULL, + a->cap[CAP_LAST_U32 - __capi], 8); } seq_putc(m, '\n'); } @@ -368,7 +356,8 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) { - seq_printf(m, "CoreDumping:\t%d\n", !!mm->core_state); + seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state); + seq_putc(m, '\n'); } int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, @@ -504,7 +493,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, /* convert nsec -> ticks */ start_time = nsec_to_clock_t(task->real_start_time); - seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); + seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns)); + seq_puts(m, " ("); + seq_puts(m, tcomm); + seq_puts(m, ") "); + seq_putc(m, state); seq_put_decimal_ll(m, " ", ppid); seq_put_decimal_ll(m, " ", pgid); seq_put_decimal_ll(m, " ", sid); diff --git a/fs/proc/base.c b/fs/proc/base.c index d53246863cfb..eafa39a3a88c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -388,14 +388,17 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, unsigned long wchan; char symname[KSYM_NAME_LEN]; - wchan = get_wchan(task); + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + goto print0; - if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS) - && !lookup_symbol_name(wchan, symname)) - seq_printf(m, "%s", symname); - else - seq_putc(m, '0'); + wchan = get_wchan(task); + if (wchan && !lookup_symbol_name(wchan, symname)) { + seq_puts(m, symname); + return 0; + } +print0: + seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ @@ -1910,6 +1913,8 @@ static int dname_to_vma_addr(struct dentry *dentry, unsigned long long sval, eval; unsigned int len; + if (str[0] == '0' && str[1] != '-') + return -EINVAL; len = _parse_integer(str, 16, &sval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; @@ -1921,6 +1926,8 @@ static int dname_to_vma_addr(struct dentry *dentry, return -EINVAL; str++; + if (str[0] == '0' && str[1]) + return -EINVAL; len = _parse_integer(str, 16, &eval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; @@ -2204,6 +2211,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) } } up_read(&mm->mmap_sem); + mmput(mm); for (i = 0; i < nr_files; i++) { char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ @@ -2221,7 +2229,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) } if (fa) flex_array_free(fa); - mmput(mm); out_put_task: put_task_struct(task); diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 403cbb12a6e9..8233e7af9389 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -6,7 +6,8 @@ static int cmdline_proc_show(struct seq_file *m, void *v) { - seq_printf(m, "%s\n", saved_command_line); + seq_puts(m, 
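The task_state() change above reads the umask directly under a single task_lock() (leaving it at -1, and thus omitting the Umask line, when there is no fs_struct) and emits Umask: before State:. From userspace those are simply two lines of /proc/<pid>/status; a short reader for the current process, with deliberately simple parsing, looks like this.

/* Print the Umask: and State: lines of /proc/self/status. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/self/status", "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                if (strncmp(line, "Umask:", 6) == 0 ||
                    strncmp(line, "State:", 6) == 0)
                        fputs(line, stdout);
        }
        fclose(f);
        return 0;
}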
saved_command_line); + seq_putc(m, '\n'); return 0; } diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 5d709fa8f3a2..04c4804cbdef 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -8,6 +8,7 @@ * Copyright (C) 1997 Theodore Ts'o */ +#include <linux/cache.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> @@ -28,6 +29,17 @@ static DEFINE_RWLOCK(proc_subdir_lock); +struct kmem_cache *proc_dir_entry_cache __ro_after_init; + +void pde_free(struct proc_dir_entry *pde) +{ + if (S_ISLNK(pde->mode)) + kfree(pde->data); + if (pde->name != pde->inline_name) + kfree(pde->name); + kmem_cache_free(proc_dir_entry_cache, pde); +} + static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len) { if (len < de->namelen) @@ -40,8 +52,8 @@ static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) { - return rb_entry_safe(rb_first_cached(&dir->subdir), - struct proc_dir_entry, subdir_node); + return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry, + subdir_node); } static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) @@ -54,7 +66,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, const char *name, unsigned int len) { - struct rb_node *node = dir->subdir.rb_root.rb_node; + struct rb_node *node = dir->subdir.rb_node; while (node) { struct proc_dir_entry *de = rb_entry(node, @@ -75,9 +87,8 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, static bool pde_subdir_insert(struct proc_dir_entry *dir, struct proc_dir_entry *de) { - struct rb_root_cached *root = &dir->subdir; - struct rb_node **new = &root->rb_root.rb_node, *parent = NULL; - bool leftmost = true; + struct rb_root *root = &dir->subdir; + struct rb_node **new = &root->rb_node, *parent = NULL; /* Figure out where to put new node */ while (*new) { @@ -89,16 +100,15 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, parent = *new; if (result < 0) new = &(*new)->rb_left; - else if (result > 0) { + else if (result > 0) new = &(*new)->rb_right; - leftmost = false; - } else + else return false; } /* Add new node and rebalance tree. */ rb_link_node(&de->subdir_node, parent, new); - rb_insert_color_cached(&de->subdir_node, root, leftmost); + rb_insert_color(&de->subdir_node, root); return true; } @@ -354,6 +364,14 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, WARN(1, "name len %u\n", qstr.len); return NULL; } + if (qstr.len == 1 && fn[0] == '.') { + WARN(1, "name '.'\n"); + return NULL; + } + if (qstr.len == 2 && fn[0] == '.' 
&& fn[1] == '.') { + WARN(1, "name '..'\n"); + return NULL; + } if (*parent == &proc_root && name_to_int(&qstr) != ~0U) { WARN(1, "create '/proc/%s' by hand\n", qstr.name); return NULL; @@ -363,16 +381,26 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, return NULL; } - ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL); + ent = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!ent) goto out; + if (qstr.len + 1 <= sizeof(ent->inline_name)) { + ent->name = ent->inline_name; + } else { + ent->name = kmalloc(qstr.len + 1, GFP_KERNEL); + if (!ent->name) { + pde_free(ent); + return NULL; + } + } + memcpy(ent->name, fn, qstr.len + 1); ent->namelen = qstr.len; ent->mode = mode; ent->nlink = nlink; - ent->subdir = RB_ROOT_CACHED; - atomic_set(&ent->count, 1); + ent->subdir = RB_ROOT; + refcount_set(&ent->refcnt, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); proc_set_user(ent, (*parent)->uid, (*parent)->gid); @@ -395,12 +423,11 @@ struct proc_dir_entry *proc_symlink(const char *name, strcpy((char*)ent->data,dest); ent->proc_iops = &proc_link_inode_operations; if (proc_register(parent, ent) < 0) { - kfree(ent->data); - kfree(ent); + pde_free(ent); ent = NULL; } } else { - kfree(ent); + pde_free(ent); ent = NULL; } } @@ -423,7 +450,7 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent->proc_iops = &proc_dir_inode_operations; parent->nlink++; if (proc_register(parent, ent) < 0) { - kfree(ent); + pde_free(ent); parent->nlink--; ent = NULL; } @@ -458,7 +485,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent->proc_iops = NULL; parent->nlink++; if (proc_register(parent, ent) < 0) { - kfree(ent); + pde_free(ent); parent->nlink--; ent = NULL; } @@ -495,7 +522,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, goto out_free; return pde; out_free: - kfree(pde); + pde_free(pde); out: return NULL; } @@ -522,19 +549,12 @@ void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) } EXPORT_SYMBOL(proc_set_user); -static void free_proc_entry(struct proc_dir_entry *de) -{ - proc_free_inum(de->low_ino); - - if (S_ISLNK(de->mode)) - kfree(de->data); - kfree(de); -} - void pde_put(struct proc_dir_entry *pde) { - if (atomic_dec_and_test(&pde->count)) - free_proc_entry(pde); + if (refcount_dec_and_test(&pde->refcnt)) { + proc_free_inum(pde->low_ino); + pde_free(pde); + } } /* @@ -555,7 +575,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = pde_subdir_find(parent, fn, len); if (de) - rb_erase_cached(&de->subdir_node, &parent->subdir); + rb_erase(&de->subdir_node, &parent->subdir); write_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); @@ -592,13 +612,13 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) write_unlock(&proc_subdir_lock); return -ENOENT; } - rb_erase_cached(&root->subdir_node, &parent->subdir); + rb_erase(&root->subdir_node, &parent->subdir); de = root; while (1) { next = pde_subdir_first(de); if (next) { - rb_erase_cached(&next->subdir_node, &de->subdir); + rb_erase(&next->subdir_node, &de->subdir); de = next; continue; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6e8724958116..2cf3b74391ca 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -54,6 +54,7 @@ static void proc_evict_inode(struct inode *inode) } static struct kmem_cache *proc_inode_cachep __ro_after_init; +static struct kmem_cache *pde_opener_cache __ro_after_init; static struct inode 
*proc_alloc_inode(struct super_block *sb) { @@ -92,7 +93,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -void __init proc_init_inodecache(void) +void __init proc_init_kmemcache(void) { proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), @@ -100,6 +101,13 @@ void __init proc_init_inodecache(void) SLAB_MEM_SPREAD|SLAB_ACCOUNT| SLAB_PANIC), init_once); + pde_opener_cache = + kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, + SLAB_ACCOUNT|SLAB_PANIC, NULL); + proc_dir_entry_cache = kmem_cache_create_usercopy( + "proc_dir_entry", sizeof(struct proc_dir_entry), 0, SLAB_PANIC, + offsetof(struct proc_dir_entry, inline_name), + sizeof_field(struct proc_dir_entry, inline_name), NULL); } static int proc_show_options(struct seq_file *seq, struct dentry *root) @@ -138,7 +146,7 @@ static void unuse_pde(struct proc_dir_entry *pde) complete(pde->pde_unload_completion); } -/* pde is locked */ +/* pde is locked on entry, unlocked on exit */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) { /* @@ -157,9 +165,10 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) pdeo->c = &c; spin_unlock(&pde->pde_unload_lock); wait_for_completion(&c); - spin_lock(&pde->pde_unload_lock); } else { struct file *file; + struct completion *c; + pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; @@ -167,9 +176,11 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) spin_lock(&pde->pde_unload_lock); /* After ->release. */ list_del(&pdeo->lh); - if (unlikely(pdeo->c)) - complete(pdeo->c); - kfree(pdeo); + c = pdeo->c; + spin_unlock(&pde->pde_unload_lock); + if (unlikely(c)) + complete(c); + kmem_cache_free(pde_opener_cache, pdeo); } } @@ -188,6 +199,7 @@ void proc_entry_rundown(struct proc_dir_entry *de) struct pde_opener *pdeo; pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); close_pdeo(de, pdeo); + spin_lock(&de->pde_unload_lock); } spin_unlock(&de->pde_unload_lock); } @@ -338,31 +350,36 @@ static int proc_reg_open(struct inode *inode, struct file *file) * * Save every "struct file" with custom ->release hook. */ - pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); - if (!pdeo) - return -ENOMEM; - - if (!use_pde(pde)) { - kfree(pdeo); + if (!use_pde(pde)) return -ENOENT; - } - open = pde->proc_fops->open; + release = pde->proc_fops->release; + if (release) { + pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); + if (!pdeo) { + rv = -ENOMEM; + goto out_unuse; + } + } + open = pde->proc_fops->open; if (open) rv = open(inode, file); - if (rv == 0 && release) { - /* To know what to release. */ - pdeo->file = file; - pdeo->closing = false; - pdeo->c = NULL; - spin_lock(&pde->pde_unload_lock); - list_add(&pdeo->lh, &pde->pde_openers); - spin_unlock(&pde->pde_unload_lock); - } else - kfree(pdeo); + if (release) { + if (rv == 0) { + /* To know what to release. 
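struct proc_dir_entry now carries short names in an inline_name[] buffer inside the structure and falls back to a separate kmalloc() only when the name does not fit; pde_free() earlier in this diff frees the name only when it points outside the struct, and proc_init_kmemcache() above whitelists exactly the inline_name region for usercopy. The same small-string idea in plain C follows; the struct layout and the 32-byte inline capacity are made up for the sketch.

/* Small-string optimization: keep short names inline, heap-allocate
 * only when the name does not fit. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry {
        char *name;                     /* points at inline_name or heap */
        char inline_name[32];           /* arbitrary inline capacity */
};

static struct entry *entry_new(const char *name)
{
        struct entry *e = calloc(1, sizeof(*e));
        size_t len = strlen(name) + 1;

        if (!e)
                return NULL;
        if (len <= sizeof(e->inline_name)) {
                e->name = e->inline_name;
        } else {
                e->name = malloc(len);
                if (!e->name) {
                        free(e);
                        return NULL;
                }
        }
        memcpy(e->name, name, len);
        return e;
}

static void entry_free(struct entry *e)
{
        if (e->name != e->inline_name)  /* mirrors the pde_free() check */
                free(e->name);
        free(e);
}

int main(void)
{
        struct entry *a = entry_new("net");
        struct entry *b = entry_new("a-rather-long-procfs-entry-name-that-will-not-fit-inline");

        if (!a || !b)
                return 1;
        printf("'%s' inline: %s\n", a->name, a->name == a->inline_name ? "yes" : "no");
        printf("'%s' inline: %s\n", b->name, b->name == b->inline_name ? "yes" : "no");
        entry_free(a);
        entry_free(b);
        return 0;
}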
*/ + pdeo->file = file; + pdeo->closing = false; + pdeo->c = NULL; + spin_lock(&pde->pde_unload_lock); + list_add(&pdeo->lh, &pde->pde_openers); + spin_unlock(&pde->pde_unload_lock); + } else + kmem_cache_free(pde_opener_cache, pdeo); + } +out_unuse: unuse_pde(pde); return rv; } @@ -375,7 +392,7 @@ static int proc_reg_release(struct inode *inode, struct file *file) list_for_each_entry(pdeo, &pde->pde_openers, lh) { if (pdeo->file == file) { close_pdeo(pde, pdeo); - break; + return 0; } } spin_unlock(&pde->pde_unload_lock); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index d697c8ab0a14..0f1692e63cb6 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -11,6 +11,7 @@ #include <linux/proc_fs.h> #include <linux/proc_ns.h> +#include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/binfmts.h> @@ -36,7 +37,7 @@ struct proc_dir_entry { * negative -> it's going away RSN */ atomic_t in_use; - atomic_t count; /* use count */ + refcount_t refcnt; struct list_head pde_openers; /* who did ->open, but not ->release */ /* protects ->pde_openers and all struct pde_opener instances */ spinlock_t pde_unload_lock; @@ -50,13 +51,22 @@ struct proc_dir_entry { kgid_t gid; loff_t size; struct proc_dir_entry *parent; - struct rb_root_cached subdir; + struct rb_root subdir; struct rb_node subdir_node; + char *name; umode_t mode; u8 namelen; - char name[]; +#ifdef CONFIG_64BIT +#define SIZEOF_PDE_INLINE_NAME (192-139) +#else +#define SIZEOF_PDE_INLINE_NAME (128-87) +#endif + char inline_name[SIZEOF_PDE_INLINE_NAME]; } __randomize_layout; +extern struct kmem_cache *proc_dir_entry_cache; +void pde_free(struct proc_dir_entry *pde); + union proc_op { int (*proc_get_link)(struct dentry *, struct path *); int (*proc_show)(struct seq_file *m, @@ -159,7 +169,7 @@ int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry * static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) { - atomic_inc(&pde->count); + refcount_inc(&pde->refcnt); return pde; } extern void pde_put(struct proc_dir_entry *); @@ -177,12 +187,12 @@ struct pde_opener { struct list_head lh; bool closing; struct completion *c; -}; +} __randomize_layout; extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; -extern void proc_init_inodecache(void); +void proc_init_kmemcache(void); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern int proc_fill_super(struct super_block *, void *data, int flags); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6bb20f864259..65a72ab57471 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -26,20 +26,7 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) { - char v[32]; - static const char blanks[7] = {' ', ' ', ' ', ' ',' ', ' ', ' '}; - int len; - - len = num_to_str(v, sizeof(v), num << (PAGE_SHIFT - 10)); - - seq_write(m, s, 16); - - if (len > 0) { - if (len < 8) - seq_write(m, blanks, 8 - len); - - seq_write(m, v, len); - } + seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); seq_write(m, " kB\n", 4); } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 68c06ae7888c..1763f370489d 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -192,15 +192,16 @@ static __net_init int proc_net_ns_init(struct net *net) int err; err = -ENOMEM; - netd = 
kzalloc(sizeof(*netd) + 4, GFP_KERNEL); + netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!netd) goto out; - netd->subdir = RB_ROOT_CACHED; + netd->subdir = RB_ROOT; netd->data = net; netd->nlink = 2; netd->namelen = 3; netd->parent = &proc_root; + netd->name = netd->inline_name; memcpy(netd->name, "net", 4); uid = make_kuid(net->user_ns, 0); @@ -223,7 +224,7 @@ static __net_init int proc_net_ns_init(struct net *net) return 0; free_net: - kfree(netd); + pde_free(netd); out: return err; } @@ -231,7 +232,7 @@ out: static __net_exit void proc_net_ns_exit(struct net *net) { remove_proc_entry("stat", net->proc_net); - kfree(net->proc_net); + pde_free(net->proc_net); } static struct pernet_operations __net_initdata proc_net_ns_ops = { diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c41ab261397d..8989936f2995 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -707,14 +707,14 @@ static bool proc_sys_link_fill_cache(struct file *file, struct ctl_table *table) { bool ret = true; + head = sysctl_head_grab(head); + if (IS_ERR(head)) + return false; - if (S_ISLNK(table->mode)) { - /* It is not an error if we can not follow the link ignore it */ - int err = sysctl_follow_link(&head, &table); - if (err) - goto out; - } + /* It is not an error if we can not follow the link ignore it */ + if (sysctl_follow_link(&head, &table)) + goto out; ret = proc_sys_fill_cache(file, ctx, head, table); out: @@ -1086,7 +1086,7 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) if ((table->proc_handler == proc_douintvec) || (table->proc_handler == proc_douintvec_minmax)) { if (table->maxlen != sizeof(unsigned int)) - err |= sysctl_err(path, table, "array now allowed"); + err |= sysctl_err(path, table, "array not allowed"); } return err; diff --git a/fs/proc/root.c b/fs/proc/root.c index ede8e64974be..61b7340b357a 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -123,23 +123,13 @@ static struct file_system_type proc_fs_type = { void __init proc_root_init(void) { - int err; - - proc_init_inodecache(); + proc_init_kmemcache(); set_proc_pid_nlink(); - err = register_filesystem(&proc_fs_type); - if (err) - return; - proc_self_init(); proc_thread_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); - -#ifdef CONFIG_SYSVIPC - proc_mkdir("sysvipc", NULL); -#endif proc_mkdir("fs", NULL); proc_mkdir("driver", NULL); proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */ @@ -150,6 +140,8 @@ void __init proc_root_init(void) proc_tty_init(); proc_mkdir("bus", NULL); proc_sys_init(); + + register_filesystem(&proc_fs_type); } static int proc_root_getattr(const struct path *path, struct kstat *stat, @@ -207,12 +199,13 @@ struct proc_dir_entry proc_root = { .namelen = 5, .mode = S_IFDIR | S_IRUGO | S_IXUGO, .nlink = 2, - .count = ATOMIC_INIT(1), + .refcnt = REFCOUNT_INIT(1), .proc_iops = &proc_root_inode_operations, .proc_fops = &proc_root_operations, .parent = &proc_root, - .subdir = RB_ROOT_CACHED, - .name = "/proc", + .subdir = RB_ROOT, + .name = proc_root.inline_name, + .inline_name = "/proc", }; int pid_ns_prepare_proc(struct pid_namespace *ns) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ec6d2983a5cb..65ae54659833 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -24,6 +24,8 @@ #include <asm/tlbflush.h> #include "internal.h" +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) void task_mem(struct seq_file *m, struct mm_struct 
*mm) { unsigned long text, lib, swap, anon, file, shmem; @@ -53,39 +55,28 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) lib = (mm->exec_vm << PAGE_SHIFT) - text; swap = get_mm_counter(mm, MM_SWAPENTS); - seq_printf(m, - "VmPeak:\t%8lu kB\n" - "VmSize:\t%8lu kB\n" - "VmLck:\t%8lu kB\n" - "VmPin:\t%8lu kB\n" - "VmHWM:\t%8lu kB\n" - "VmRSS:\t%8lu kB\n" - "RssAnon:\t%8lu kB\n" - "RssFile:\t%8lu kB\n" - "RssShmem:\t%8lu kB\n" - "VmData:\t%8lu kB\n" - "VmStk:\t%8lu kB\n" - "VmExe:\t%8lu kB\n" - "VmLib:\t%8lu kB\n" - "VmPTE:\t%8lu kB\n" - "VmSwap:\t%8lu kB\n", - hiwater_vm << (PAGE_SHIFT-10), - total_vm << (PAGE_SHIFT-10), - mm->locked_vm << (PAGE_SHIFT-10), - mm->pinned_vm << (PAGE_SHIFT-10), - hiwater_rss << (PAGE_SHIFT-10), - total_rss << (PAGE_SHIFT-10), - anon << (PAGE_SHIFT-10), - file << (PAGE_SHIFT-10), - shmem << (PAGE_SHIFT-10), - mm->data_vm << (PAGE_SHIFT-10), - mm->stack_vm << (PAGE_SHIFT-10), - text >> 10, - lib >> 10, - mm_pgtables_bytes(mm) >> 10, - swap << (PAGE_SHIFT-10)); + SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); + SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); + SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); + SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm); + SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); + SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); + SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); + SEQ_PUT_DEC(" kB\nRssFile:\t", file); + SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem); + SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm); + SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm); + seq_put_decimal_ull_width(m, + " kB\nVmExe:\t", text >> 10, 8); + seq_put_decimal_ull_width(m, + " kB\nVmLib:\t", lib >> 10, 8); + seq_put_decimal_ull_width(m, + " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8); + SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); + seq_puts(m, " kB\n"); hugetlb_report_usage(m, mm); } +#undef SEQ_PUT_DEC unsigned long task_vsize(struct mm_struct *mm) { @@ -287,15 +278,18 @@ static void show_vma_header_prefix(struct seq_file *m, dev_t dev, unsigned long ino) { seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); - seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", - start, - end, - flags & VM_READ ? 'r' : '-', - flags & VM_WRITE ? 'w' : '-', - flags & VM_EXEC ? 'x' : '-', - flags & VM_MAYSHARE ? 's' : 'p', - pgoff, - MAJOR(dev), MINOR(dev), ino); + seq_put_hex_ll(m, NULL, start, 8); + seq_put_hex_ll(m, "-", end, 8); + seq_putc(m, ' '); + seq_putc(m, flags & VM_READ ? 'r' : '-'); + seq_putc(m, flags & VM_WRITE ? 'w' : '-'); + seq_putc(m, flags & VM_EXEC ? 'x' : '-'); + seq_putc(m, flags & VM_MAYSHARE ? 
's' : 'p'); + seq_put_hex_ll(m, " ", pgoff, 8); + seq_put_hex_ll(m, " ", MAJOR(dev), 2); + seq_put_hex_ll(m, ":", MINOR(dev), 2); + seq_put_decimal_ull(m, " ", ino); + seq_putc(m, ' '); } static void @@ -694,8 +688,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) if (!mnemonics[i][0]) continue; if (vma->vm_flags & (1UL << i)) { - seq_printf(m, "%c%c ", - mnemonics[i][0], mnemonics[i][1]); + seq_putc(m, mnemonics[i][0]); + seq_putc(m, mnemonics[i][1]); + seq_putc(m, ' '); } } seq_putc(m, '\n'); @@ -736,6 +731,8 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) { } +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) >> 10, 8) static int show_smap(struct seq_file *m, void *v, int is_pid) { struct proc_maps_private *priv = m->private; @@ -809,51 +806,34 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) ret = SEQ_SKIP; } - if (!rollup_mode) - seq_printf(m, - "Size: %8lu kB\n" - "KernelPageSize: %8lu kB\n" - "MMUPageSize: %8lu kB\n", - (vma->vm_end - vma->vm_start) >> 10, - vma_kernel_pagesize(vma) >> 10, - vma_mmu_pagesize(vma) >> 10); - - - if (!rollup_mode || last_vma) - seq_printf(m, - "Rss: %8lu kB\n" - "Pss: %8lu kB\n" - "Shared_Clean: %8lu kB\n" - "Shared_Dirty: %8lu kB\n" - "Private_Clean: %8lu kB\n" - "Private_Dirty: %8lu kB\n" - "Referenced: %8lu kB\n" - "Anonymous: %8lu kB\n" - "LazyFree: %8lu kB\n" - "AnonHugePages: %8lu kB\n" - "ShmemPmdMapped: %8lu kB\n" - "Shared_Hugetlb: %8lu kB\n" - "Private_Hugetlb: %7lu kB\n" - "Swap: %8lu kB\n" - "SwapPss: %8lu kB\n" - "Locked: %8lu kB\n", - mss->resident >> 10, - (unsigned long)(mss->pss >> (10 + PSS_SHIFT)), - mss->shared_clean >> 10, - mss->shared_dirty >> 10, - mss->private_clean >> 10, - mss->private_dirty >> 10, - mss->referenced >> 10, - mss->anonymous >> 10, - mss->lazyfree >> 10, - mss->anonymous_thp >> 10, - mss->shmem_thp >> 10, - mss->shared_hugetlb >> 10, - mss->private_hugetlb >> 10, - mss->swap >> 10, - (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)), - (unsigned long)(mss->pss >> (10 + PSS_SHIFT))); + if (!rollup_mode) { + SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); + SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); + SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); + seq_puts(m, " kB\n"); + } + if (!rollup_mode || last_vma) { + SEQ_PUT_DEC("Rss: ", mss->resident); + SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); + SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); + SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); + SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); + SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); + SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); + SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); + SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); + SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); + SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); + seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", + mss->private_hugetlb >> 10, 7); + SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); + SEQ_PUT_DEC(" kB\nSwapPss: ", + mss->swap_pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nLocked: ", mss->pss >> PSS_SHIFT); + seq_puts(m, " kB\n"); + } if (!rollup_mode) { arch_show_smap(m, vma); show_smap_vma_flags(m, vma); @@ -861,6 +841,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) m_cache_vma(m, vma); return ret; } +#undef SEQ_PUT_DEC static int show_pid_smap(struct seq_file *m, 
void *v) { diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 70057359fbaf..23148c3ed675 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2643,7 +2643,7 @@ static int journal_init_dev(struct super_block *super, if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; - reiserfs_warning(super, + reiserfs_warning(super, "sh-457", "journal_init_dev: Cannot open '%s': %i", jdev_name, result); return result; diff --git a/fs/seq_file.c b/fs/seq_file.c index eea09f6d8830..c6c27f1f9c98 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -6,6 +6,7 @@ * initial implementation -- AV, Oct 2001. */ +#include <linux/cache.h> #include <linux/fs.h> #include <linux/export.h> #include <linux/seq_file.h> @@ -19,6 +20,8 @@ #include <linux/uaccess.h> #include <asm/page.h> +static struct kmem_cache *seq_file_cache __ro_after_init; + static void seq_set_overflow(struct seq_file *m) { m->count = m->size; @@ -26,7 +29,7 @@ static void seq_set_overflow(struct seq_file *m) static void *seq_buf_alloc(unsigned long size) { - return kvmalloc(size, GFP_KERNEL); + return kvmalloc(size, GFP_KERNEL_ACCOUNT); } /** @@ -51,7 +54,7 @@ int seq_open(struct file *file, const struct seq_operations *op) WARN_ON(file->private_data); - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL); if (!p) return -ENOMEM; @@ -366,7 +369,7 @@ int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; kvfree(m->buf); - kfree(m); + kmem_cache_free(seq_file_cache, m); return 0; } EXPORT_SYMBOL(seq_release); @@ -563,7 +566,7 @@ static void single_stop(struct seq_file *p, void *v) int single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data) { - struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL); + struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT); int res = -ENOMEM; if (op) { @@ -625,7 +628,7 @@ void *__seq_open_private(struct file *f, const struct seq_operations *ops, void *private; struct seq_file *seq; - private = kzalloc(psize, GFP_KERNEL); + private = kzalloc(psize, GFP_KERNEL_ACCOUNT); if (private == NULL) goto out; @@ -673,29 +676,37 @@ void seq_puts(struct seq_file *m, const char *s) } EXPORT_SYMBOL(seq_puts); -/* +/** * A helper routine for putting decimal numbers without rich format of printf(). * only 'unsigned long long' is supported. - * This routine will put strlen(delimiter) + number into seq_file. + * @m: seq_file identifying the buffer to which data should be written + * @delimiter: a string which is printed before the number + * @num: the number + * @width: a minimum field width + * + * This routine will put strlen(delimiter) + number into seq_filed. * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. 
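seq_put_decimal_ull_width(), documented above, pads the converted number to a minimum field width, and seq_put_hex_ll(), added just below and already used by render_cap_t() and show_vma_header_prefix() earlier in this diff, sizes its output with __builtin_clzll(), special-casing zero because clz of 0 is undefined. Both tricks are reproduced here in a freestanding userspace program so the output can be compared against printf(); the buffer handling is simplified and is not the kernel implementation (snprintf stands in for num_to_str()).

/* Reimplement the two formatting tricks for comparison with printf(). */
#include <stdio.h>
#include <string.h>

static const char hex_asc[] = "0123456789abcdef";

/* Hex digits needed for v: one nibble per 4 significant bits. */
static unsigned int hex_len(unsigned long long v)
{
        if (v == 0)             /* __builtin_clzll(0) is undefined */
                return 1;
        return (sizeof(v) * 8 - __builtin_clzll(v) + 3) / 4;
}

static void put_hex(char *buf, unsigned long long v, unsigned int width)
{
        unsigned int len = hex_len(v);
        int i;

        if (len < width)
                len = width;
        for (i = len - 1; i >= 0; i--) {        /* least significant nibble last */
                buf[i] = hex_asc[v & 0xf];
                v >>= 4;
        }
        buf[len] = '\0';
}

static void put_dec_width(char *buf, unsigned long long v, unsigned int width)
{
        char tmp[24];
        unsigned int len;

        len = snprintf(tmp, sizeof(tmp), "%llu", v);    /* stand-in for num_to_str() */
        memset(buf, ' ', width > len ? width - len : 0);
        strcpy(buf + (width > len ? width - len : 0), tmp);
}

int main(void)
{
        char buf[32];

        put_hex(buf, 0x1a2b, 8);
        printf("[%s] vs [%08llx]\n", buf, 0x1a2bULL);
        put_dec_width(buf, 4096, 8);
        printf("[%s] vs [%8llu]\n", buf, 4096ULL);
        return 0;
}

Both lines should print identical bracketed fields, matching the "equal to seq_printf()" claim in the new kernel-doc.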
*/ -void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, - unsigned long long num) +void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, + unsigned long long num, unsigned int width) { int len; if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ goto overflow; - len = strlen(delimiter); - if (m->count + len >= m->size) - goto overflow; + if (delimiter && delimiter[0]) { + if (delimiter[1] == 0) + seq_putc(m, delimiter[0]); + else + seq_puts(m, delimiter); + } - memcpy(m->buf + m->count, delimiter, len); - m->count += len; + if (!width) + width = 1; - if (m->count + 1 >= m->size) + if (m->count + width >= m->size) goto overflow; if (num < 10) { @@ -703,7 +714,7 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, return; } - len = num_to_str(m->buf + m->count, m->size - m->count, num); + len = num_to_str(m->buf + m->count, m->size - m->count, num, width); if (!len) goto overflow; @@ -713,8 +724,60 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, overflow: seq_set_overflow(m); } + +void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, + unsigned long long num) +{ + return seq_put_decimal_ull_width(m, delimiter, num, 0); +} EXPORT_SYMBOL(seq_put_decimal_ull); +/** + * seq_put_hex_ll - put a number in hexadecimal notation + * @m: seq_file identifying the buffer to which data should be written + * @delimiter: a string which is printed before the number + * @v: the number + * @width: a minimum field width + * + * seq_put_hex_ll(m, "", v, 8) is equal to seq_printf(m, "%08llx", v) + * + * This routine is very quick when you show lots of numbers. + * In usual cases, it will be better to use seq_printf(). It's easier to read. + */ +void seq_put_hex_ll(struct seq_file *m, const char *delimiter, + unsigned long long v, unsigned int width) +{ + unsigned int len; + int i; + + if (delimiter && delimiter[0]) { + if (delimiter[1] == 0) + seq_putc(m, delimiter[0]); + else + seq_puts(m, delimiter); + } + + /* If x is 0, the result of __builtin_clzll is undefined */ + if (v == 0) + len = 1; + else + len = (sizeof(v) * 8 - __builtin_clzll(v) + 3) / 4; + + if (len < width) + len = width; + + if (m->count + len > m->size) { + seq_set_overflow(m); + return; + } + + for (i = len - 1; i >= 0; i--) { + m->buf[m->count + i] = hex_asc[0xf & v]; + v = v >> 4; + } + m->count += len; +} + void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num) { int len; @@ -722,12 +785,12 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */ goto overflow; - len = strlen(delimiter); - if (m->count + len >= m->size) - goto overflow; - - memcpy(m->buf + m->count, delimiter, len); - m->count += len; + if (delimiter && delimiter[0]) { + if (delimiter[1] == 0) + seq_putc(m, delimiter[0]); + else + seq_puts(m, delimiter); + } if (m->count + 2 >= m->size) goto overflow; @@ -742,7 +805,7 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num return; } - len = num_to_str(m->buf + m->count, m->size - m->count, num); + len = num_to_str(m->buf + m->count, m->size - m->count, num, 0); if (!len) goto overflow; @@ -782,8 +845,14 @@ EXPORT_SYMBOL(seq_write); void seq_pad(struct seq_file *m, char c) { int size = m->pad_until - m->count; - if (size > 0) - seq_printf(m, "%*s", size, ""); + if (size > 0) { + if (size + m->count > m->size) { + seq_set_overflow(m); + return; + } + memset(m->buf + m->count, ' ', 
size); + m->count += size; + } if (c) seq_putc(m, c); } @@ -1040,3 +1109,8 @@ seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, return NULL; } EXPORT_SYMBOL(seq_hlist_next_percpu); + +void __init seq_file_init(void) +{ + seq_file_cache = KMEM_CACHE(seq_file, SLAB_ACCOUNT|SLAB_PANIC); +} diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 436a1de3fcdf..0ab824f574ed 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1467,19 +1467,8 @@ xfs_vm_set_page_dirty( newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); - if (newly_dirty) { - /* sigh - __set_page_dirty() is static, so copy it here, too */ - unsigned long flags; - - spin_lock_irqsave(&mapping->tree_lock, flags); - if (page->mapping) { /* Race with truncate? */ - WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - } - spin_unlock_irqrestore(&mapping->tree_lock, flags); - } + if (newly_dirty) + __set_page_dirty(page, mapping, 1); unlock_page_memcg(page); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 3e4ce54d84ab..09da0f124699 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -175,7 +175,7 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) } long congestion_wait(int sync, long timeout); -long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); +long wait_iff_congested(int sync, long timeout); static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi) { @@ -329,7 +329,7 @@ static inline bool inode_to_wb_is_valid(struct inode *inode) * @inode: inode of interest * * Returns the wb @inode is currently associated with. The caller must be - * holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the + * holding either @inode->i_lock, the i_pages lock, or the * associated wb's list_lock. */ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) @@ -337,7 +337,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&inode->i_lock) && - !lockdep_is_held(&inode->i_mapping->tree_lock) && + !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) && !lockdep_is_held(&inode->i_wb->list_lock))); #endif return inode->i_wb; @@ -349,7 +349,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) * @lockedp: temp bool output param, to be passed to the end function * * The caller wants to access the wb associated with @inode but isn't - * holding inode->i_lock, mapping->tree_lock or wb->list_lock. This + * holding inode->i_lock, the i_pages lock or wb->list_lock. This * function determines the wb associated with @inode and ensures that the * association doesn't change until the transaction is finished with * unlocked_inode_to_wb_end(). @@ -370,11 +370,11 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; if (unlikely(*lockedp)) - spin_lock_irq(&inode->i_mapping->tree_lock); + xa_lock_irq(&inode->i_mapping->i_pages); /* - * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock. - * inode_to_wb() will bark. Deref directly. + * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages + * lock. inode_to_wb() will bark. Deref directly. 
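What typically sits between that begin/end pair is a short, lock-free access to the wb the inode is currently associated with; a minimal caller-side sketch (not taken from this series, the body comment is only a placeholder):

#include <linux/backing-dev.h>

static void my_touch_wb(struct inode *inode)
{
	struct bdi_writeback *wb;
	bool locked;

	wb = unlocked_inode_to_wb_begin(inode, &locked);
	/* the inode->wb association cannot switch until ..._end() */
	/* ... e.g. update a per-wb statistic for this inode ... */
	unlocked_inode_to_wb_end(inode, locked);
}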
*/ return inode->i_wb; } @@ -387,7 +387,7 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) { if (unlikely(locked)) - spin_unlock_irq(&inode->i_mapping->tree_lock); + xa_unlock_irq(&inode->i_mapping->i_pages); rcu_read_unlock(); } diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index b0abe21d6cc9..4955e0863b83 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -61,6 +61,8 @@ struct linux_binprm { unsigned interp_flags; unsigned interp_data; unsigned long loader, exec; + + struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */ } __randomize_layout; #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 @@ -118,6 +120,7 @@ extern int __must_check remove_arg_zero(struct linux_binprm *); extern int search_binary_handler(struct linux_binprm *); extern int flush_old_exec(struct linux_binprm * bprm); extern void setup_new_exec(struct linux_binprm * bprm); +extern void finalize_exec(struct linux_binprm *bprm); extern void would_dump(struct linux_binprm *, struct file *); extern int suid_dumpable; diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index d3f264a5b04d..ceb96ecab96e 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -17,9 +17,6 @@ */ #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) -#define randomized_struct_fields_start struct { -#define randomized_struct_fields_end }; - /* all clang versions usable with the kernel support KASAN ABI version 5 */ #define KASAN_ABI_VERSION 5 diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index e2c7f4369eff..b4bf73f5e38f 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -242,6 +242,9 @@ #if defined(RANDSTRUCT_PLUGIN) && !defined(__CHECKER__) #define __randomize_layout __attribute__((randomize_layout)) #define __no_randomize_layout __attribute__((no_randomize_layout)) +/* This anon struct can add padding, so only enable it under randstruct. */ +#define randomized_struct_fields_start struct { +#define randomized_struct_fields_end } __randomize_layout; #endif #endif /* GCC_VERSION >= 40500 */ @@ -256,15 +259,6 @@ */ #define __visible __attribute__((externally_visible)) -/* - * RANDSTRUCT_PLUGIN wants to use an anonymous struct, but it is only - * possible since GCC 4.6. To provide as much build testing coverage - * as possible, this is used for all GCC 4.6+ builds, and not just on - * RANDSTRUCT_PLUGIN builds. 
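For reference, the markers bracket the members that a randomized layout is allowed to shuffle; everything outside the pair keeps its written order, and with the plugin disabled the markers fall back to empty definitions so the layout stays exactly as written. A sketch with made-up names:

#include <linux/compiler.h>

struct my_ctx {
	unsigned long	flags;			/* always first */
	randomized_struct_fields_start
	void		*private_data;		/* these two may be */
	struct list_head node;			/* reordered by the plugin */
	randomized_struct_fields_end
};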
- */ -#define randomized_struct_fields_start struct { -#define randomized_struct_fields_end } __randomize_layout; - #endif /* GCC_VERSION >= 40600 */ diff --git a/include/linux/const.h b/include/linux/const.h new file mode 100644 index 000000000000..7b55a55f5911 --- /dev/null +++ b/include/linux/const.h @@ -0,0 +1,9 @@ +#ifndef _LINUX_CONST_H +#define _LINUX_CONST_H + +#include <uapi/linux/const.h> + +#define UL(x) (_UL(x)) +#define ULL(x) (_ULL(x)) + +#endif /* _LINUX_CONST_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 2aa02cad94d4..92efaf1f8977 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -13,6 +13,7 @@ #include <linux/list_lru.h> #include <linux/llist.h> #include <linux/radix-tree.h> +#include <linux/xarray.h> #include <linux/rbtree.h> #include <linux/init.h> #include <linux/pid.h> @@ -390,12 +391,11 @@ int pagecache_write_end(struct file *, struct address_space *mapping, struct address_space { struct inode *host; /* owner: inode, block_device */ - struct radix_tree_root page_tree; /* radix tree of all pages */ - spinlock_t tree_lock; /* and lock protecting it */ + struct radix_tree_root i_pages; /* cached pages */ atomic_t i_mmap_writable;/* count VM_SHARED mappings */ struct rb_root_cached i_mmap; /* tree of private and shared mappings */ struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ - /* Protected by tree_lock together with the radix tree */ + /* Protected by the i_pages lock */ unsigned long nrpages; /* number of total pages */ /* number of shadow or DAX exceptional entries */ unsigned long nrexceptional; @@ -1989,7 +1989,7 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) * * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to * synchronize competing switching instances and to tell - * wb stat updates to grab mapping->tree_lock. See + * wb stat updates to grab the i_pages lock. See * inode_switch_wb_work_fn() for details. * * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 325017ad9311..39988924de3a 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -80,76 +80,145 @@ struct hmm; /* - * hmm_pfn_t - HMM uses its own pfn type to keep several flags per page + * hmm_pfn_flag_e - HMM flag enums * * Flags: - * HMM_PFN_VALID: pfn is valid - * HMM_PFN_READ: CPU page table has read permission set + * HMM_PFN_VALID: pfn is valid. It has, at least, read permission. * HMM_PFN_WRITE: CPU page table has write permission set + * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE) + * + * The driver provide a flags array, if driver valid bit for an entry is bit + * 3 ie (entry & (1 << 3)) is true if entry is valid then driver must provide + * an array in hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3. + * Same logic apply to all flags. This is same idea as vm_page_prot in vma + * except that this is per device driver rather than per architecture. + */ +enum hmm_pfn_flag_e { + HMM_PFN_VALID = 0, + HMM_PFN_WRITE, + HMM_PFN_DEVICE_PRIVATE, + HMM_PFN_FLAG_MAX +}; + +/* + * hmm_pfn_value_e - HMM pfn special value + * + * Flags: * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory - * HMM_PFN_EMPTY: corresponding CPU page table entry is pte_none() + * HMM_PFN_NONE: corresponding CPU page table entry is pte_none() * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the * result of vm_insert_pfn() or vm_insert_page(). 
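The UL()/ULL() wrappers added in include/linux/const.h above exist so that the same constant definition works from both C and assembly (via _AC()); a small usage sketch with made-up names:

#include <linux/const.h>

#define MY_WINDOW_SIZE	UL(0x200000)	/* unsigned long in C, bare 0x200000 in asm */
#define MY_ENABLE_BIT	_BITUL(3)	/* (1UL << 3) in C */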
Therefore, it should not * be mirrored by a device, because the entry will never have HMM_PFN_VALID * set and the pfn value is undefined. - * HMM_PFN_DEVICE_UNADDRESSABLE: unaddressable device memory (ZONE_DEVICE) + * + * Driver provide entry value for none entry, error entry and special entry, + * driver can alias (ie use same value for error and special for instance). It + * should not alias none and error or special. + * + * HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be: + * hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous, + * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table + * hmm_range.values[HMM_PFN_SPECIAL] if CPU page table entry is a special one */ -typedef unsigned long hmm_pfn_t; +enum hmm_pfn_value_e { + HMM_PFN_ERROR, + HMM_PFN_NONE, + HMM_PFN_SPECIAL, + HMM_PFN_VALUE_MAX +}; -#define HMM_PFN_VALID (1 << 0) -#define HMM_PFN_READ (1 << 1) -#define HMM_PFN_WRITE (1 << 2) -#define HMM_PFN_ERROR (1 << 3) -#define HMM_PFN_EMPTY (1 << 4) -#define HMM_PFN_SPECIAL (1 << 5) -#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 6) -#define HMM_PFN_SHIFT 7 +/* + * struct hmm_range - track invalidation lock on virtual address range + * + * @vma: the vm area struct for the range + * @list: all range lock are on a list + * @start: range virtual start address (inclusive) + * @end: range virtual end address (exclusive) + * @pfns: array of pfns (big enough for the range) + * @flags: pfn flags to match device driver page table + * @values: pfn value for some special case (none, special, error, ...) + * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) + * @valid: pfns array did not change since it has been fill by an HMM function + */ +struct hmm_range { + struct vm_area_struct *vma; + struct list_head list; + unsigned long start; + unsigned long end; + uint64_t *pfns; + const uint64_t *flags; + const uint64_t *values; + uint8_t pfn_shift; + bool valid; +}; /* - * hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t - * @pfn: hmm_pfn_t to convert to struct page - * Returns: struct page pointer if pfn is a valid hmm_pfn_t, NULL otherwise + * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn + * @range: range use to decode HMM pfn value + * @pfn: HMM pfn value to get corresponding struct page from + * Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise * - * If the hmm_pfn_t is valid (ie valid flag set) then return the struct page - * matching the pfn value stored in the hmm_pfn_t. Otherwise return NULL. + * If the HMM pfn is valid (ie valid flag set) then return the struct page + * matching the pfn value stored in the HMM pfn. Otherwise return NULL. 
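A sketch of the per-driver tables this scheme implies; the bit positions and sentinel values below are invented, the point is only that flags[]/values[] map HMM's abstract entries onto whatever encoding the device page table uses:

#include <linux/hmm.h>

static const uint64_t my_hmm_flags[HMM_PFN_FLAG_MAX] = {
	[HMM_PFN_VALID]		 = 1ULL << 0,
	[HMM_PFN_WRITE]		 = 1ULL << 1,
	[HMM_PFN_DEVICE_PRIVATE] = 1ULL << 2,
};

static const uint64_t my_hmm_values[HMM_PFN_VALUE_MAX] = {
	[HMM_PFN_NONE]		 = 0,
	[HMM_PFN_ERROR]		 = 1ULL << 62,
	[HMM_PFN_SPECIAL]	 = 1ULL << 63,
};

/* filled in before calling hmm_vma_get_pfns()/hmm_vma_fault() */
struct hmm_range range = {
	.vma	   = vma,
	.start	   = start,
	.end	   = end,
	.pfns	   = pfns,
	.flags	   = my_hmm_flags,
	.values	   = my_hmm_values,
	.pfn_shift = 3,		/* bits 0-2 carry the flags above */
};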
*/ -static inline struct page *hmm_pfn_t_to_page(hmm_pfn_t pfn) +static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, + uint64_t pfn) { - if (!(pfn & HMM_PFN_VALID)) + if (pfn == range->values[HMM_PFN_NONE]) + return NULL; + if (pfn == range->values[HMM_PFN_ERROR]) return NULL; - return pfn_to_page(pfn >> HMM_PFN_SHIFT); + if (pfn == range->values[HMM_PFN_SPECIAL]) + return NULL; + if (!(pfn & range->flags[HMM_PFN_VALID])) + return NULL; + return pfn_to_page(pfn >> range->pfn_shift); } /* - * hmm_pfn_t_to_pfn() - return pfn value store in a hmm_pfn_t - * @pfn: hmm_pfn_t to extract pfn from - * Returns: pfn value if hmm_pfn_t is valid, -1UL otherwise + * hmm_pfn_to_pfn() - return pfn value store in a HMM pfn + * @range: range use to decode HMM pfn value + * @pfn: HMM pfn value to extract pfn from + * Returns: pfn value if HMM pfn is valid, -1UL otherwise */ -static inline unsigned long hmm_pfn_t_to_pfn(hmm_pfn_t pfn) +static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, + uint64_t pfn) { - if (!(pfn & HMM_PFN_VALID)) + if (pfn == range->values[HMM_PFN_NONE]) + return -1UL; + if (pfn == range->values[HMM_PFN_ERROR]) + return -1UL; + if (pfn == range->values[HMM_PFN_SPECIAL]) + return -1UL; + if (!(pfn & range->flags[HMM_PFN_VALID])) return -1UL; - return (pfn >> HMM_PFN_SHIFT); + return (pfn >> range->pfn_shift); } /* - * hmm_pfn_t_from_page() - create a valid hmm_pfn_t value from struct page - * @page: struct page pointer for which to create the hmm_pfn_t - * Returns: valid hmm_pfn_t for the page + * hmm_pfn_from_page() - create a valid HMM pfn value from struct page + * @range: range use to encode HMM pfn value + * @page: struct page pointer for which to create the HMM pfn + * Returns: valid HMM pfn for the page */ -static inline hmm_pfn_t hmm_pfn_t_from_page(struct page *page) +static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, + struct page *page) { - return (page_to_pfn(page) << HMM_PFN_SHIFT) | HMM_PFN_VALID; + return (page_to_pfn(page) << range->pfn_shift) | + range->flags[HMM_PFN_VALID]; } /* - * hmm_pfn_t_from_pfn() - create a valid hmm_pfn_t value from pfn - * @pfn: pfn value for which to create the hmm_pfn_t - * Returns: valid hmm_pfn_t for the pfn + * hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn + * @range: range use to encode HMM pfn value + * @pfn: pfn value for which to create the HMM pfn + * Returns: valid HMM pfn for the pfn */ -static inline hmm_pfn_t hmm_pfn_t_from_pfn(unsigned long pfn) +static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, + unsigned long pfn) { - return (pfn << HMM_PFN_SHIFT) | HMM_PFN_VALID; + return (pfn << range->pfn_shift) | + range->flags[HMM_PFN_VALID]; } @@ -218,6 +287,16 @@ enum hmm_update_type { * @update: callback to update range on a device */ struct hmm_mirror_ops { + /* release() - release hmm_mirror + * + * @mirror: pointer to struct hmm_mirror + * + * This is called when the mm_struct is being released. + * The callback should make sure no references to the mirror occur + * after the callback returns. 
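A sketch of how a driver might implement the new release() hook next to its existing invalidation callback; every my_* identifier is hypothetical:

#include <linux/hmm.h>

static void my_mirror_release(struct hmm_mirror *mirror)
{
	struct my_dev *mdev = container_of(mirror, struct my_dev, mirror);

	/* the mm is going away: tear down all device mappings of it and
	 * make sure nothing dereferences the mirror afterwards */
	my_dev_unmap_all(mdev);
}

static const struct hmm_mirror_ops my_mirror_ops = {
	.release		    = my_mirror_release,
	.sync_cpu_device_pagetables = my_sync_cpu_device_pagetables,
};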
+ */ + void (*release)(struct hmm_mirror *mirror); + /* sync_cpu_device_pagetables() - synchronize page tables * * @mirror: pointer to struct hmm_mirror @@ -262,23 +341,6 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); /* - * struct hmm_range - track invalidation lock on virtual address range - * - * @list: all range lock are on a list - * @start: range virtual start address (inclusive) - * @end: range virtual end address (exclusive) - * @pfns: array of pfns (big enough for the range) - * @valid: pfns array did not change since it has been fill by an HMM function - */ -struct hmm_range { - struct list_head list; - unsigned long start; - unsigned long end; - hmm_pfn_t *pfns; - bool valid; -}; - -/* * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device * driver lock that serializes device page table updates, then call * hmm_vma_range_done(), to check if the snapshot is still valid. The same @@ -291,17 +353,13 @@ struct hmm_range { * * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID ! */ -int hmm_vma_get_pfns(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns); -bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range); +int hmm_vma_get_pfns(struct hmm_range *range); +bool hmm_vma_range_done(struct hmm_range *range); /* * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will - * not migrate any device memory back to system memory. The hmm_pfn_t array will + * not migrate any device memory back to system memory. The HMM pfn array will * be updated with the fault result and current snapshot of the CPU page table * for the range. * @@ -310,22 +368,26 @@ bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range); * function returns -EAGAIN. * * Return value does not reflect if the fault was successful for every single - * address or not. Therefore, the caller must to inspect the hmm_pfn_t array to + * address or not. Therefore, the caller must to inspect the HMM pfn array to * determine fault status for each address. * * Trying to fault inside an invalid vma will result in -EINVAL. * * See the function description in mm/hmm.c for further documentation. */ -int hmm_vma_fault(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns, - bool write, - bool block); -#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ +int hmm_vma_fault(struct hmm_range *range, bool block); +/* Below are for HMM internal use only! Not to be used by device driver! */ +void hmm_mm_destroy(struct mm_struct *mm); + +static inline void hmm_mm_init(struct mm_struct *mm) +{ + mm->hmm = NULL; +} +#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ +static inline void hmm_mm_destroy(struct mm_struct *mm) {} +static inline void hmm_mm_init(struct mm_struct *mm) {} +#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) struct hmm_devmem; @@ -498,23 +560,9 @@ struct hmm_device { struct hmm_device *hmm_device_new(void *drvdata); void hmm_device_put(struct hmm_device *hmm_device); #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ -#endif /* IS_ENABLED(CONFIG_HMM) */ - -/* Below are for HMM internal use only! Not to be used by device driver! 
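Under the new calling convention everything travels in the range, so the snapshot rule spelled out above boils down to a retry loop of roughly this shape (a fragment; the driver lock and my_* helpers are hypothetical):

again:
	ret = hmm_vma_get_pfns(&range);
	if (ret)
		return ret;

	mutex_lock(&mdev->pt_lock);		/* device page table lock */
	if (!hmm_vma_range_done(&range)) {
		/* CPU page table changed underneath us: discard and retry */
		mutex_unlock(&mdev->pt_lock);
		goto again;
	}
	my_dev_program_ptes(mdev, &range);	/* consume range.pfns */
	mutex_unlock(&mdev->pt_lock);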
*/ -#if IS_ENABLED(CONFIG_HMM_MIRROR) -void hmm_mm_destroy(struct mm_struct *mm); - -static inline void hmm_mm_init(struct mm_struct *mm) -{ - mm->hmm = NULL; -} -#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -static inline void hmm_mm_destroy(struct mm_struct *mm) {} -static inline void hmm_mm_init(struct mm_struct *mm) {} -#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ - - #else /* IS_ENABLED(CONFIG_HMM) */ static inline void hmm_mm_destroy(struct mm_struct *mm) {} static inline void hmm_mm_init(struct mm_struct *mm) {} +#endif /* IS_ENABLED(CONFIG_HMM) */ + #endif /* LINUX_HMM_H */ diff --git a/include/linux/idr.h b/include/linux/idr.h index 7d6a6313f0ab..e856f4e0ab35 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -29,29 +29,31 @@ struct idr { #define IDR_FREE 0 /* Set the IDR flag and the IDR_FREE tag */ -#define IDR_RT_MARKER ((__force gfp_t)(3 << __GFP_BITS_SHIFT)) +#define IDR_RT_MARKER (ROOT_IS_IDR | (__force gfp_t) \ + (1 << (ROOT_TAG_SHIFT + IDR_FREE))) -#define IDR_INIT_BASE(base) { \ - .idr_rt = RADIX_TREE_INIT(IDR_RT_MARKER), \ +#define IDR_INIT_BASE(name, base) { \ + .idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER), \ .idr_base = (base), \ .idr_next = 0, \ } /** * IDR_INIT() - Initialise an IDR. + * @name: Name of IDR. * * A freshly-initialised IDR contains no IDs. */ -#define IDR_INIT IDR_INIT_BASE(0) +#define IDR_INIT(name) IDR_INIT_BASE(name, 0) /** - * DEFINE_IDR() - Define a statically-allocated IDR - * @name: Name of IDR + * DEFINE_IDR() - Define a statically-allocated IDR. + * @name: Name of IDR. * * An IDR defined using this macro is ready for use with no additional * initialisation required. It contains no IDs. */ -#define DEFINE_IDR(name) struct idr name = IDR_INIT +#define DEFINE_IDR(name) struct idr name = IDR_INIT(name) /** * idr_get_cursor - Return the current position of the cyclic allocator @@ -218,10 +220,10 @@ struct ida { struct radix_tree_root ida_rt; }; -#define IDA_INIT { \ - .ida_rt = RADIX_TREE_INIT(IDR_RT_MARKER | GFP_NOWAIT), \ +#define IDA_INIT(name) { \ + .ida_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER | GFP_NOWAIT), \ } -#define DEFINE_IDA(name) struct ida name = IDA_INIT +#define DEFINE_IDA(name) struct ida name = IDA_INIT(name) int ida_pre_get(struct ida *ida, gfp_t gfp_mask); int ida_get_new_above(struct ida *ida, int starting_id, int *p_id); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 52b70894eaa5..6a1eb0b0aad9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -439,7 +439,8 @@ extern long simple_strtol(const char *,char **,unsigned int); extern unsigned long long simple_strtoull(const char *,char **,unsigned int); extern long long simple_strtoll(const char *,char **,unsigned int); -extern int num_to_str(char *buf, int size, unsigned long long num); +extern int num_to_str(char *buf, int size, + unsigned long long num, unsigned int width); /* lib/printf utilities */ @@ -543,6 +544,7 @@ extern enum system_states { SYSTEM_RESTART, } system_state; +/* This cannot be an enum because some may be used in assembly source. 
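As the reworded kfifo note above says, one producer plus one consumer needs no locking at all; a minimal sketch (handle_event() and the size are made up):

#include <linux/kfifo.h>

static DEFINE_KFIFO(my_events, unsigned int, 128);

/* sole producer, e.g. an interrupt handler: no locking needed */
static void my_push_event(unsigned int id)
{
	kfifo_put(&my_events, id);	/* drops the event if the fifo is full */
}

/* sole consumer, e.g. a worker: no locking needed either */
static void my_drain_events(void)
{
	unsigned int id;

	while (kfifo_get(&my_events, &id))
		handle_event(id);
}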
*/ #define TAINT_PROPRIETARY_MODULE 0 #define TAINT_FORCED_MODULE 1 #define TAINT_CPU_OUT_OF_SPEC 2 @@ -560,7 +562,8 @@ extern enum system_states { #define TAINT_SOFTLOCKUP 14 #define TAINT_LIVEPATCH 15 #define TAINT_AUX 16 -#define TAINT_FLAGS_COUNT 17 +#define TAINT_RANDSTRUCT 17 +#define TAINT_FLAGS_COUNT 18 struct taint_flag { char c_true; /* character printed when tainted */ diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index e251533a5939..89fc8dc7bf38 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -41,11 +41,11 @@ */ /* - * Note about locking : There is no locking required until only * one reader - * and one writer is using the fifo and no kfifo_reset() will be * called - * kfifo_reset_out() can be safely used, until it will be only called + * Note about locking: There is no locking required until only one reader + * and one writer is using the fifo and no kfifo_reset() will be called. + * kfifo_reset_out() can be safely used, until it will be only called * in the reader thread. - * For multiple writer and one reader there is only a need to lock the writer. + * For multiple writer and one reader there is only a need to lock the writer. * And vice versa for only one writer and multiple reader there is only a need * to lock the reader. */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c46016bb25eb..d99b71bc2c66 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -48,13 +48,12 @@ enum memcg_stat_item { MEMCG_NR_STAT, }; -/* Cgroup-specific events, on top of universal VM events */ -enum memcg_event_item { - MEMCG_LOW = NR_VM_EVENT_ITEMS, +enum memcg_memory_event { + MEMCG_LOW, MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM, - MEMCG_NR_EVENTS, + MEMCG_NR_MEMORY_EVENTS, }; struct mem_cgroup_reclaim_cookie { @@ -88,7 +87,7 @@ enum mem_cgroup_events_target { struct mem_cgroup_stat_cpu { long count[MEMCG_NR_STAT]; - unsigned long events[MEMCG_NR_EVENTS]; + unsigned long events[NR_VM_EVENT_ITEMS]; unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; }; @@ -120,6 +119,9 @@ struct mem_cgroup_per_node { unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; + bool congested; /* memcg has many dirty pages */ + /* backed by a congested BDI */ + struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ }; @@ -202,7 +204,8 @@ struct mem_cgroup { /* OOM-Killer disable */ int oom_kill_disable; - /* handle for "memory.events" */ + /* memory.events */ + atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; struct cgroup_file events_file; /* protect arrays of thresholds */ @@ -231,9 +234,10 @@ struct mem_cgroup { struct task_struct *move_lock_task; unsigned long move_lock_flags; + /* memory.stat */ struct mem_cgroup_stat_cpu __percpu *stat_cpu; atomic_long_t stat[MEMCG_NR_STAT]; - atomic_long_t events[MEMCG_NR_EVENTS]; + atomic_long_t events[NR_VM_EVENT_ITEMS]; unsigned long socket_pressure; @@ -645,9 +649,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); -/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void __count_memcg_events(struct mem_cgroup *memcg, - int idx, unsigned long count) + enum vm_event_item idx, + unsigned long count) { unsigned long x; @@ -663,7 +667,8 @@ static inline void __count_memcg_events(struct mem_cgroup *memcg, } static inline void count_memcg_events(struct mem_cgroup *memcg, - int idx, unsigned long count) + enum 
vm_event_item idx, + unsigned long count) { unsigned long flags; @@ -672,9 +677,8 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, local_irq_restore(flags); } -/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void count_memcg_page_event(struct page *page, - int idx) + enum vm_event_item idx) { if (page->mem_cgroup) count_memcg_events(page->mem_cgroup, idx, 1); @@ -698,10 +702,10 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, rcu_read_unlock(); } -static inline void mem_cgroup_event(struct mem_cgroup *memcg, - enum memcg_event_item event) +static inline void memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event) { - count_memcg_events(memcg, event, 1); + atomic_long_inc(&memcg->memory_events[event]); cgroup_file_notify(&memcg->events_file); } @@ -721,8 +725,8 @@ static inline bool mem_cgroup_disabled(void) return true; } -static inline void mem_cgroup_event(struct mem_cgroup *memcg, - enum memcg_event_item event) +static inline void memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event) { } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 2b0265265c28..e0e49b5b1ee1 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -216,9 +216,6 @@ void put_online_mems(void); void mem_hotplug_begin(void); void mem_hotplug_done(void); -extern void set_zone_contiguous(struct zone *zone); -extern void clear_zone_contiguous(struct zone *zone); - #else /* ! CONFIG_MEMORY_HOTPLUG */ #define pfn_to_online_page(pfn) \ ({ \ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index ab45f8a0d288..f2b4abbca55e 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -7,8 +7,7 @@ #include <linux/migrate_mode.h> #include <linux/hugetlb.h> -typedef struct page *new_page_t(struct page *page, unsigned long private, - int **reason); +typedef struct page *new_page_t(struct page *page, unsigned long private); typedef void free_page_t(struct page *page, unsigned long private); /* @@ -43,9 +42,9 @@ static inline struct page *new_page_nodemask(struct page *page, return alloc_huge_page_nodemask(page_hstate(compound_head(page)), preferred_nid, nodemask); - if (thp_migration_supported() && PageTransHuge(page)) { - order = HPAGE_PMD_ORDER; + if (PageTransHuge(page)) { gfp_mask |= GFP_TRANSHUGE; + order = HPAGE_PMD_ORDER; } if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3ad632366973..1ac1f06a4be6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -747,7 +747,7 @@ int finish_mkwrite_fault(struct vm_fault *vmf); * refcount. The each user mapping also has a reference to the page. * * The pagecache pages are stored in a per-mapping radix tree, which is - * rooted at mapping->page_tree, and indexed by offset. + * rooted at mapping->i_pages, and indexed by offset. * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space * lists, we instead now tag pages as dirty/writeback in the radix tree. 
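With the int **reason parameter gone, a migration allocation callback now receives only the page and the opaque private value; a sketch of the new shape, using the private value as a target node id purely for illustration:

#include <linux/migrate.h>
#include <linux/gfp.h>

static struct page *my_new_page(struct page *page, unsigned long private)
{
	int nid = (int)private;

	return alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}

Such a callback would then be handed to migrate_pages() together with a matching free_page_t callback for the error path.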
* @@ -1466,6 +1466,7 @@ extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned int offset, unsigned int length); +void __set_page_dirty(struct page *, struct address_space *, int warn); int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, @@ -2108,6 +2109,7 @@ extern void setup_per_cpu_pageset(void); extern void zone_pcp_update(struct zone *zone); extern void zone_pcp_reset(struct zone *zone); +extern void setup_zone_pageset(struct zone *zone); /* page_alloc.c */ extern int min_free_kbytes; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f11ae29005f1..32699b2dc52a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -180,6 +180,7 @@ enum node_stat_item { NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ + NR_INDIRECTLY_RECLAIMABLE_BYTES, /* measured in bytes */ NR_VM_NODE_STAT_ITEMS }; @@ -884,7 +885,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; +extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index cdad58bbfd8b..4ae347cbc36d 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -63,7 +63,6 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, bool skip_hwpoisoned_pages); -struct page *alloc_migrate_target(struct page *page, unsigned long private, - int **resultp); +struct page *alloc_migrate_target(struct page *page, unsigned long private); #endif diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 34ce3ebf97d5..b1bd2186e6d2 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -144,7 +144,7 @@ void release_pages(struct page **pages, int nr); * 3. check the page is still in pagecache (if no, goto 1) * * Remove-side that cares about stability of _refcount (eg. reclaim) has the - * following (with tree_lock held for write): + * following (with the i_pages lock held): * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg) * B. remove page from pagecache * C. free the page @@ -157,7 +157,7 @@ void release_pages(struct page **pages, int nr); * * It is possible that between 1 and 2, the page is removed then the exact same * page is inserted into the same position in pagecache. That's OK: the - * old find_get_page using tree_lock could equally have run before or after + * old find_get_page using a lock could equally have run before or after * such a re-insertion, depending on order that locks are granted. 
* * Lookups racing against pagecache insertion isn't a big problem: either 1 diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index fc55ff31eca7..34149e8b5f73 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -104,25 +104,29 @@ struct radix_tree_node { unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; -/* The top bits of gfp_mask are used to store the root tags and the IDR flag */ -#define ROOT_IS_IDR ((__force gfp_t)(1 << __GFP_BITS_SHIFT)) -#define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT + 1) +/* The IDR tag is stored in the low bits of the GFP flags */ +#define ROOT_IS_IDR ((__force gfp_t)4) +/* The top bits of gfp_mask are used to store the root tags */ +#define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT) struct radix_tree_root { + spinlock_t xa_lock; gfp_t gfp_mask; struct radix_tree_node __rcu *rnode; }; -#define RADIX_TREE_INIT(mask) { \ +#define RADIX_TREE_INIT(name, mask) { \ + .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ .gfp_mask = (mask), \ .rnode = NULL, \ } #define RADIX_TREE(name, mask) \ - struct radix_tree_root name = RADIX_TREE_INIT(mask) + struct radix_tree_root name = RADIX_TREE_INIT(name, mask) #define INIT_RADIX_TREE(root, mask) \ do { \ + spin_lock_init(&(root)->xa_lock); \ (root)->gfp_mask = (mask); \ (root)->rnode = NULL; \ } while (0) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 9806184bb3d5..2c570cd934af 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -104,7 +104,8 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU -extern void arch_pick_mmap_layout(struct mm_struct *mm); +extern void arch_pick_mmap_layout(struct mm_struct *mm, + struct rlimit *rlim_stack); extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); @@ -113,7 +114,8 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); #else -static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} +static inline void arch_pick_mmap_layout(struct mm_struct *mm, + struct rlimit *rlim_stack) {} #endif static inline bool in_vfork(struct task_struct *tsk) diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index ab437dd2e3b9..a121982af0f5 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -118,9 +118,14 @@ __printf(2, 3) void seq_printf(struct seq_file *m, const char *fmt, ...); void seq_putc(struct seq_file *m, char c); void seq_puts(struct seq_file *m, const char *s); +void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, + unsigned long long num, unsigned int width); void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num); void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num); +void seq_put_hex_ll(struct seq_file *m, const char *delimiter, + unsigned long long v, unsigned int width); + void seq_escape(struct seq_file *m, const char *s, const char *esc); void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, @@ -235,4 +240,5 @@ extern struct hlist_node *seq_hlist_start_percpu(struct hlist_head __percpu *hea extern struct hlist_node *seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, int *cpu, loff_t *pos); +void seq_file_init(void); #endif diff --git a/include/linux/utsname.h b/include/linux/utsname.h index c8060c2ecd04..44429d9142ca 100644 --- 
a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -44,6 +44,8 @@ static inline void put_uts_ns(struct uts_namespace *ns) { kref_put(&ns->kref, free_uts_ns); } + +void uts_ns_init(void); #else static inline void get_uts_ns(struct uts_namespace *ns) { @@ -61,6 +63,10 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags, return old_ns; } + +static inline void uts_ns_init(void) +{ +} #endif #ifdef CONFIG_PROC_SYSCTL diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index a4c2317d8b9f..f25cef84b41d 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -20,6 +20,17 @@ extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif +struct reclaim_stat { + unsigned nr_dirty; + unsigned nr_unqueued_dirty; + unsigned nr_congested; + unsigned nr_writeback; + unsigned nr_immediate; + unsigned nr_activate; + unsigned nr_ref_keep; + unsigned nr_unmap_fail; +}; + #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. diff --git a/include/linux/xarray.h b/include/linux/xarray.h new file mode 100644 index 000000000000..2dfc8006fe64 --- /dev/null +++ b/include/linux/xarray.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _LINUX_XARRAY_H +#define _LINUX_XARRAY_H +/* + * eXtensible Arrays + * Copyright (c) 2017 Microsoft Corporation + * Author: Matthew Wilcox <mawilcox@microsoft.com> + */ + +#include <linux/spinlock.h> + +#define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) +#define xa_lock(xa) spin_lock(&(xa)->xa_lock) +#define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) +#define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) +#define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) +#define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) +#define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) +#define xa_lock_irqsave(xa, flags) \ + spin_lock_irqsave(&(xa)->xa_lock, flags) +#define xa_unlock_irqrestore(xa, flags) \ + spin_unlock_irqrestore(&(xa)->xa_lock, flags) + +#endif /* _LINUX_XARRAY_H */ diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 6570c5b45ba1..a1cb91342231 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -346,15 +346,9 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TP_PROTO(int nid, unsigned long nr_scanned, unsigned long nr_reclaimed, - unsigned long nr_dirty, unsigned long nr_writeback, - unsigned long nr_congested, unsigned long nr_immediate, - unsigned long nr_activate, unsigned long nr_ref_keep, - unsigned long nr_unmap_fail, - int priority, int file), + struct reclaim_stat *stat, int priority, int file), - TP_ARGS(nid, nr_scanned, nr_reclaimed, nr_dirty, nr_writeback, - nr_congested, nr_immediate, nr_activate, nr_ref_keep, - nr_unmap_fail, priority, file), + TP_ARGS(nid, nr_scanned, nr_reclaimed, stat, priority, file), TP_STRUCT__entry( __field(int, nid) @@ -375,13 +369,13 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, __entry->nid = nid; __entry->nr_scanned = nr_scanned; __entry->nr_reclaimed = nr_reclaimed; - __entry->nr_dirty = nr_dirty; - __entry->nr_writeback = nr_writeback; - __entry->nr_congested = nr_congested; - __entry->nr_immediate = nr_immediate; - __entry->nr_activate = nr_activate; - __entry->nr_ref_keep = nr_ref_keep; - __entry->nr_unmap_fail = nr_unmap_fail; + __entry->nr_dirty = stat->nr_dirty; + __entry->nr_writeback = stat->nr_writeback; + __entry->nr_congested = stat->nr_congested; + __entry->nr_immediate = stat->nr_immediate; + 
__entry->nr_activate = stat->nr_activate; + __entry->nr_ref_keep = stat->nr_ref_keep; + __entry->nr_unmap_fail = stat->nr_unmap_fail; __entry->priority = priority; __entry->reclaim_flags = trace_shrink_flags(file); ), diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index f8b134f5608f..e7ee32861d51 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -27,6 +27,9 @@ # define MAP_UNINITIALIZED 0x0 /* Don't support this flag */ #endif +/* 0x0100 - 0x80000 flags are defined in asm-generic/mman.h */ +#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ + /* * Flags for mlock */ diff --git a/include/uapi/linux/const.h b/include/uapi/linux/const.h index 92537757590a..5ed721ad5b19 100644 --- a/include/uapi/linux/const.h +++ b/include/uapi/linux/const.h @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* const.h: Macros for dealing with constants. */ -#ifndef _LINUX_CONST_H -#define _LINUX_CONST_H +#ifndef _UAPI_LINUX_CONST_H +#define _UAPI_LINUX_CONST_H /* Some constant macros are used in both assembler and * C code. Therefore we cannot annotate them always with @@ -22,7 +22,10 @@ #define _AT(T,X) ((T)(X)) #endif -#define _BITUL(x) (_AC(1,UL) << (x)) -#define _BITULL(x) (_AC(1,ULL) << (x)) +#define _UL(x) (_AC(x, UL)) +#define _ULL(x) (_AC(x, ULL)) -#endif /* !(_LINUX_CONST_H) */ +#define _BITUL(x) (_UL(1) << (x)) +#define _BITULL(x) (_ULL(1) << (x)) + +#endif /* _UAPI_LINUX_CONST_H */ diff --git a/include/uapi/linux/msg.h b/include/uapi/linux/msg.h index 5d5ab81dc9be..e4a0d9a9a9e8 100644 --- a/include/uapi/linux/msg.h +++ b/include/uapi/linux/msg.h @@ -7,6 +7,7 @@ /* ipcs ctl commands */ #define MSG_STAT 11 #define MSG_INFO 12 +#define MSG_STAT_ANY 13 /* msgrcv options */ #define MSG_NOERROR 010000 /* no error if message is too big */ diff --git a/include/uapi/linux/sem.h b/include/uapi/linux/sem.h index 9c3e745b0656..39a1876f039e 100644 --- a/include/uapi/linux/sem.h +++ b/include/uapi/linux/sem.h @@ -19,6 +19,7 @@ /* ipcs ctl cmds */ #define SEM_STAT 18 #define SEM_INFO 19 +#define SEM_STAT_ANY 20 /* Obsolete, used only for backwards compatibility and libc5 compiles */ struct semid_ds { diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h index 4de12a39b075..dde1344f047c 100644 --- a/include/uapi/linux/shm.h +++ b/include/uapi/linux/shm.h @@ -83,8 +83,9 @@ struct shmid_ds { #define SHM_UNLOCK 12 /* ipcs ctl commands */ -#define SHM_STAT 13 -#define SHM_INFO 14 +#define SHM_STAT 13 +#define SHM_INFO 14 +#define SHM_STAT_ANY 15 /* Obsolete, used only for backwards compatibility */ struct shminfo { diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index 12c159824c7b..035a5f0ab26b 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -255,7 +255,7 @@ int __init rd_load_image(char *from) nblocks, ((nblocks-1)/devblocks)+1, nblocks>devblocks ? 
"s" : ""); for (i = 0, disk = 1; i < nblocks; i++) { if (i && (i % devblocks == 0)) { - printk("done disk #%d.\n", disk++); + pr_cont("done disk #%d.\n", disk++); rotate = 0; if (ksys_close(in_fd)) { printk("Error closing the disk.\n"); @@ -278,7 +278,7 @@ int __init rd_load_image(char *from) } #endif } - printk("done.\n"); + pr_cont("done.\n"); successful_load: res = 1; diff --git a/init/main.c b/init/main.c index d499f4a80e0b..b795aa341a3a 100644 --- a/init/main.c +++ b/init/main.c @@ -51,6 +51,7 @@ #include <linux/taskstats_kern.h> #include <linux/delayacct.h> #include <linux/unistd.h> +#include <linux/utsname.h> #include <linux/rmap.h> #include <linux/mempolicy.h> #include <linux/key.h> @@ -706,6 +707,7 @@ asmlinkage __visible void __init start_kernel(void) cred_init(); fork_init(); proc_caches_init(); + uts_ns_init(); buffer_init(); key_init(); security_init(); @@ -713,6 +715,7 @@ asmlinkage __visible void __init start_kernel(void) vfs_caches_init(); pagecache_init(); signals_init(); + seq_file_init(); proc_root_init(); nsfs_init(); cpuset_init(); diff --git a/ipc/msg.c b/ipc/msg.c index 114a21189613..56fd1c73eedc 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -497,14 +497,14 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid, memset(p, 0, sizeof(*p)); rcu_read_lock(); - if (cmd == MSG_STAT) { + if (cmd == MSG_STAT || cmd == MSG_STAT_ANY) { msq = msq_obtain_object(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); goto out_unlock; } id = msq->q_perm.id; - } else { + } else { /* IPC_STAT */ msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); @@ -512,9 +512,14 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid, } } - err = -EACCES; - if (ipcperms(ns, &msq->q_perm, S_IRUGO)) - goto out_unlock; + /* see comment for SHM_STAT_ANY */ + if (cmd == MSG_STAT_ANY) + audit_ipc_obj(&msq->q_perm); + else { + err = -EACCES; + if (ipcperms(ns, &msq->q_perm, S_IRUGO)) + goto out_unlock; + } err = security_msg_queue_msgctl(&msq->q_perm, cmd); if (err) @@ -572,6 +577,7 @@ long ksys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) return err; } case MSG_STAT: /* msqid is an index rather than a msg queue id */ + case MSG_STAT_ANY: case IPC_STAT: err = msgctl_stat(ns, msqid, cmd, &msqid64); if (err < 0) @@ -690,6 +696,7 @@ long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr) } case IPC_STAT: case MSG_STAT: + case MSG_STAT_ANY: err = msgctl_stat(ns, msqid, cmd, &msqid64); if (err < 0) return err; diff --git a/ipc/sem.c b/ipc/sem.c index 2994da8ccc7f..06be75d9217a 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1220,14 +1220,14 @@ static int semctl_stat(struct ipc_namespace *ns, int semid, memset(semid64, 0, sizeof(*semid64)); rcu_read_lock(); - if (cmd == SEM_STAT) { + if (cmd == SEM_STAT || cmd == SEM_STAT_ANY) { sma = sem_obtain_object(ns, semid); if (IS_ERR(sma)) { err = PTR_ERR(sma); goto out_unlock; } id = sma->sem_perm.id; - } else { + } else { /* IPC_STAT */ sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { err = PTR_ERR(sma); @@ -1235,9 +1235,14 @@ static int semctl_stat(struct ipc_namespace *ns, int semid, } } - err = -EACCES; - if (ipcperms(ns, &sma->sem_perm, S_IRUGO)) - goto out_unlock; + /* see comment for SHM_STAT_ANY */ + if (cmd == SEM_STAT_ANY) + audit_ipc_obj(&sma->sem_perm); + else { + err = -EACCES; + if (ipcperms(ns, &sma->sem_perm, S_IRUGO)) + goto out_unlock; + } err = security_sem_semctl(&sma->sem_perm, cmd); if (err) @@ -1626,6 +1631,7 @@ long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg) return 
semctl_info(ns, semid, cmd, p); case IPC_STAT: case SEM_STAT: + case SEM_STAT_ANY: err = semctl_stat(ns, semid, cmd, &semid64); if (err < 0) return err; @@ -1732,6 +1738,7 @@ long compat_ksys_semctl(int semid, int semnum, int cmd, int arg) return semctl_info(ns, semid, cmd, p); case IPC_STAT: case SEM_STAT: + case SEM_STAT_ANY: err = semctl_stat(ns, semid, cmd, &semid64); if (err < 0) return err; diff --git a/ipc/shm.c b/ipc/shm.c index acefe44fefef..5639345dbec9 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -415,7 +415,7 @@ static int shm_split(struct vm_area_struct *vma, unsigned long addr) struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); - if (sfd->vm_ops && sfd->vm_ops->split) + if (sfd->vm_ops->split) return sfd->vm_ops->split(vma, addr); return 0; @@ -947,14 +947,14 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid, memset(tbuf, 0, sizeof(*tbuf)); rcu_read_lock(); - if (cmd == SHM_STAT) { + if (cmd == SHM_STAT || cmd == SHM_STAT_ANY) { shp = shm_obtain_object(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); goto out_unlock; } id = shp->shm_perm.id; - } else { + } else { /* IPC_STAT */ shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); @@ -962,9 +962,20 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid, } } - err = -EACCES; - if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) - goto out_unlock; + /* + * Semantically SHM_STAT_ANY ought to be identical to + * that functionality provided by the /proc/sysvipc/ + * interface. As such, only audit these calls and + * do not do traditional S_IRUGO permission checks on + * the ipc object. + */ + if (cmd == SHM_STAT_ANY) + audit_ipc_obj(&shp->shm_perm); + else { + err = -EACCES; + if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) + goto out_unlock; + } err = security_shm_shmctl(&shp->shm_perm, cmd); if (err) @@ -1104,6 +1115,7 @@ long ksys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) return err; } case SHM_STAT: + case SHM_STAT_ANY: case IPC_STAT: { err = shmctl_stat(ns, shmid, cmd, &sem64); if (err < 0) @@ -1282,6 +1294,7 @@ long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr) return err; } case IPC_STAT: + case SHM_STAT_ANY: case SHM_STAT: err = shmctl_stat(ns, shmid, cmd, &sem64); if (err < 0) diff --git a/ipc/util.c b/ipc/util.c index 3783b7991cc7..4e81182fa0ac 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -89,6 +89,7 @@ static int __init ipc_init(void) { int err_sem, err_msg; + proc_mkdir("sysvipc", NULL); err_sem = sem_init(); WARN(err_sem, "ipc: sysv sem_init failed: %d\n", err_sem); err_msg = msg_init(); diff --git a/kernel/panic.c b/kernel/panic.c index 6c3b08cd1139..42e487488554 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,7 +34,8 @@ #define PANIC_BLINK_SPD 18 int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; -static unsigned long tainted_mask; +static unsigned long tainted_mask = + IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); @@ -308,52 +309,40 @@ EXPORT_SYMBOL(panic); * is being removed anyway. 
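Seen from userspace, MSG_STAT_ANY/SEM_STAT_ANY/SHM_STAT_ANY behave like the existing *_STAT commands (kernel index in, ipc id out) with the S_IRUGO permission check replaced by an audit record. A sketch of walking all shm segments with the new command, assuming headers that already define SHM_STAT_ANY:

#include <sys/ipc.h>
#include <sys/shm.h>

static void walk_segments(void)
{
	struct shm_info info;
	struct shmid_ds ds;
	int maxidx, i;

	maxidx = shmctl(0, SHM_INFO, (struct shmid_ds *)&info);
	for (i = 0; i <= maxidx; i++) {
		int id = shmctl(i, SHM_STAT_ANY, &ds);

		if (id < 0)
			continue;		/* unused index */
		/* ds now describes the segment whose ipc id is 'id' */
	}
}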
*/ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { - { 'P', 'G', true }, /* TAINT_PROPRIETARY_MODULE */ - { 'F', ' ', true }, /* TAINT_FORCED_MODULE */ - { 'S', ' ', false }, /* TAINT_CPU_OUT_OF_SPEC */ - { 'R', ' ', false }, /* TAINT_FORCED_RMMOD */ - { 'M', ' ', false }, /* TAINT_MACHINE_CHECK */ - { 'B', ' ', false }, /* TAINT_BAD_PAGE */ - { 'U', ' ', false }, /* TAINT_USER */ - { 'D', ' ', false }, /* TAINT_DIE */ - { 'A', ' ', false }, /* TAINT_OVERRIDDEN_ACPI_TABLE */ - { 'W', ' ', false }, /* TAINT_WARN */ - { 'C', ' ', true }, /* TAINT_CRAP */ - { 'I', ' ', false }, /* TAINT_FIRMWARE_WORKAROUND */ - { 'O', ' ', true }, /* TAINT_OOT_MODULE */ - { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */ - { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */ - { 'K', ' ', true }, /* TAINT_LIVEPATCH */ - { 'X', ' ', true }, /* TAINT_AUX */ + [ TAINT_PROPRIETARY_MODULE ] = { 'P', 'G', true }, + [ TAINT_FORCED_MODULE ] = { 'F', ' ', true }, + [ TAINT_CPU_OUT_OF_SPEC ] = { 'S', ' ', false }, + [ TAINT_FORCED_RMMOD ] = { 'R', ' ', false }, + [ TAINT_MACHINE_CHECK ] = { 'M', ' ', false }, + [ TAINT_BAD_PAGE ] = { 'B', ' ', false }, + [ TAINT_USER ] = { 'U', ' ', false }, + [ TAINT_DIE ] = { 'D', ' ', false }, + [ TAINT_OVERRIDDEN_ACPI_TABLE ] = { 'A', ' ', false }, + [ TAINT_WARN ] = { 'W', ' ', false }, + [ TAINT_CRAP ] = { 'C', ' ', true }, + [ TAINT_FIRMWARE_WORKAROUND ] = { 'I', ' ', false }, + [ TAINT_OOT_MODULE ] = { 'O', ' ', true }, + [ TAINT_UNSIGNED_MODULE ] = { 'E', ' ', true }, + [ TAINT_SOFTLOCKUP ] = { 'L', ' ', false }, + [ TAINT_LIVEPATCH ] = { 'K', ' ', true }, + [ TAINT_AUX ] = { 'X', ' ', true }, + [ TAINT_RANDSTRUCT ] = { 'T', ' ', true }, }; /** - * print_tainted - return a string to represent the kernel taint state. + * print_tainted - return a string to represent the kernel taint state. * - * 'P' - Proprietary module has been loaded. - * 'F' - Module has been forcibly loaded. - * 'S' - SMP with CPUs not designed for SMP. - * 'R' - User forced a module unload. - * 'M' - System experienced a machine check exception. - * 'B' - System has hit bad_page. - * 'U' - Userspace-defined naughtiness. - * 'D' - Kernel has oopsed before - * 'A' - ACPI table overridden. - * 'W' - Taint on warning. - * 'C' - modules from drivers/staging are loaded. - * 'I' - Working around severe firmware bug. - * 'O' - Out-of-tree module has been loaded. - * 'E' - Unsigned module has been loaded. - * 'L' - A soft lockup has previously occurred. - * 'K' - Kernel has been live patched. - * 'X' - Auxiliary taint, for distros' use. + * For individual taint flag meanings, see Documentation/sysctl/kernel.txt * - * The string is overwritten by the next call to print_tainted(). + * The string is overwritten by the next call to print_tainted(), + * but is always NULL terminated. 
*/ const char *print_tainted(void) { static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")]; + BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT); + if (tainted_mask) { char *s; int i; diff --git a/kernel/params.c b/kernel/params.c index cc9108c2a1fd..ce89f757e6da 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -111,8 +111,8 @@ bool parameq(const char *a, const char *b) static void param_check_unsafe(const struct kernel_param *kp) { if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { - pr_warn("Setting dangerous option %s - tainting kernel\n", - kp->name); + pr_notice("Setting dangerous option %s - tainting kernel\n", + kp->name); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } } diff --git a/kernel/pid.c b/kernel/pid.c index ed6c343fe50d..157fe4b19971 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -70,7 +70,7 @@ int pid_max_max = PID_MAX_LIMIT; */ struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), - .idr = IDR_INIT, + .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bdf7090b106d..6a78cf70761d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1340,7 +1340,7 @@ static struct ctl_table vm_table[] = { { .procname = "dirtytime_expire_seconds", .data = &dirtytime_expire_interval, - .maxlen = sizeof(dirty_expire_interval), + .maxlen = sizeof(dirtytime_expire_interval), .mode = 0644, .proc_handler = dirtytime_interval_handler, .extra1 = &zero, @@ -2511,6 +2511,15 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, } #endif +/** + * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure + * @min: pointer to minimum allowable value + * @max: pointer to maximum allowable value + * + * The do_proc_dointvec_minmax_conv_param structure provides the + * minimum and maximum values for doing range checking for those sysctl + * parameters that use the proc_dointvec_minmax() handler. + */ struct do_proc_dointvec_minmax_conv_param { int *min; int *max; @@ -2554,7 +2563,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * - * Returns 0 on success. + * Returns 0 on success or -EINVAL on write when the range check fails. */ int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2567,6 +2576,15 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, do_proc_dointvec_minmax_conv, ¶m); } +/** + * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure + * @min: pointer to minimum allowable value + * @max: pointer to maximum allowable value + * + * The do_proc_douintvec_minmax_conv_param structure provides the + * minimum and maximum values for doing range checking for those sysctl + * parameters that use the proc_douintvec_minmax() handler. + */ struct do_proc_douintvec_minmax_conv_param { unsigned int *min; unsigned int *max; @@ -2614,7 +2632,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, * check for UINT_MAX to avoid having to support wrap around uses from * userspace. * - * Returns 0 on success. + * Returns 0 on success or -ERANGE on write when the range check fails. 
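For context, the min/max pointers documented here come straight from the ctl_table entry's extra1/extra2 fields; a typical (made-up) entry wired to proc_dointvec_minmax() looks like:

#include <linux/sysctl.h>

static int my_value = 1;
static int my_min;		/* 0 */
static int my_max = 10;

static struct ctl_table my_table[] = {
	{
		.procname	= "my_value",
		.data		= &my_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &my_min,
		.extra2		= &my_max,
	},
	{ }
};

Once registered (e.g. with register_sysctl()), writes outside [0, 10] fail with -EINVAL, as the updated comment states.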
*/ int proc_douintvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) diff --git a/kernel/utsname.c b/kernel/utsname.c index 913fe4336d2b..dcd6be1996fe 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -19,6 +19,8 @@ #include <linux/proc_ns.h> #include <linux/sched/task.h> +static struct kmem_cache *uts_ns_cache __ro_after_init; + static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES); @@ -33,7 +35,7 @@ static struct uts_namespace *create_uts_ns(void) { struct uts_namespace *uts_ns; - uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); + uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); if (uts_ns) kref_init(&uts_ns->kref); return uts_ns; @@ -42,7 +44,7 @@ static struct uts_namespace *create_uts_ns(void) /* * Clone a new ns copying an original utsname, setting refcount to 1 * @old_ns: namespace to clone - * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise + * Return ERR_PTR(-ENOMEM) on error (failure to allocate), new ns otherwise */ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) @@ -75,7 +77,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, return ns; fail_free: - kfree(ns); + kmem_cache_free(uts_ns_cache, ns); fail_dec: dec_uts_namespaces(ucounts); fail: @@ -113,7 +115,7 @@ void free_uts_ns(struct kref *kref) dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); - kfree(ns); + kmem_cache_free(uts_ns_cache, ns); } static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) @@ -169,3 +171,13 @@ const struct proc_ns_operations utsns_operations = { .install = utsns_install, .owner = utsns_owner, }; + +void __init uts_ns_init(void) +{ + uts_ns_cache = kmem_cache_create_usercopy( + "uts_namespace", sizeof(struct uts_namespace), 0, + SLAB_PANIC|SLAB_ACCOUNT, + offsetof(struct uts_namespace, name), + sizeof_field(struct uts_namespace, name), + NULL); +} diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 51c6bf0d93c6..c40c7b734cd1 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -800,6 +800,30 @@ config SOFTLOCKUP_DETECTOR chance to run. The current stack trace is displayed upon detection and the system will stay locked up. +config BOOTPARAM_SOFTLOCKUP_PANIC + bool "Panic (Reboot) On Soft Lockups" + depends on SOFTLOCKUP_DETECTOR + help + Say Y here to enable the kernel to panic on "soft lockups", + which are bugs that cause the kernel to loop in kernel + mode for more than 20 seconds (configurable using the watchdog_thresh + sysctl), without giving other tasks a chance to run. + + The panic can be used in combination with panic_timeout, + to cause the system to reboot automatically after a + lockup has been detected. This feature is useful for + high-availability systems that have uptime guarantees and + where a lockup must be resolved ASAP. + + Say N if unsure. 
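The uts_namespace conversion above is an instance of the usercopy-whitelisting pattern: only a declared window of the object may be copied to or from userspace out of that cache. The same idea applied to a made-up structure:

#include <linux/slab.h>

struct my_obj {
	unsigned long	state;		/* kernel-private, never copied out */
	char		name[64];	/* the only part userspace may see */
};

static struct kmem_cache *my_obj_cache;

static void my_obj_cache_init(void)
{
	my_obj_cache = kmem_cache_create_usercopy("my_obj",
			sizeof(struct my_obj), 0, SLAB_ACCOUNT,
			offsetof(struct my_obj, name),
			sizeof_field(struct my_obj, name), NULL);
}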
+ +config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE + int + depends on SOFTLOCKUP_DETECTOR + range 0 1 + default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC + default 1 if BOOTPARAM_SOFTLOCKUP_PANIC + config HARDLOCKUP_DETECTOR_PERF bool select SOFTLOCKUP_DETECTOR @@ -849,30 +873,6 @@ config BOOTPARAM_HARDLOCKUP_PANIC_VALUE default 0 if !BOOTPARAM_HARDLOCKUP_PANIC default 1 if BOOTPARAM_HARDLOCKUP_PANIC -config BOOTPARAM_SOFTLOCKUP_PANIC - bool "Panic (Reboot) On Soft Lockups" - depends on SOFTLOCKUP_DETECTOR - help - Say Y here to enable the kernel to panic on "soft lockups", - which are bugs that cause the kernel to loop in kernel - mode for more than 20 seconds (configurable using the watchdog_thresh - sysctl), without giving other tasks a chance to run. - - The panic can be used in combination with panic_timeout, - to cause the system to reboot automatically after a - lockup has been detected. This feature is useful for - high-availability systems that have uptime guarantees and - where a lockup must be resolved ASAP. - - Say N if unsure. - -config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE - int - depends on SOFTLOCKUP_DETECTOR - range 0 1 - default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC - default 1 if BOOTPARAM_SOFTLOCKUP_PANIC - config DETECT_HUNG_TASK bool "Detect Hung Tasks" depends on DEBUG_KERNEL diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index a669c193b878..19d42ea75ec2 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -46,3 +46,10 @@ config UBSAN_NULL help This option enables detection of memory accesses via a null pointer. + +config TEST_UBSAN + tristate "Module for testing for undefined behavior detection" + depends on m && UBSAN + help + This is a test module for UBSAN. + It triggers various undefined behavior, and detect it. diff --git a/lib/Makefile b/lib/Makefile index 8fc0d3a9b34f..ce20696d5a92 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -53,6 +53,9 @@ obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o +CFLAGS_test_kasan.o += -fno-builtin +obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o +UBSAN_SANITIZE_test_ubsan.o := y obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_TEST_LKM) += test_module.o diff --git a/lib/list_debug.c b/lib/list_debug.c index a34db8d27667..5d5424b51b74 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -21,13 +21,13 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { if (CHECK_DATA_CORRUPTION(next->prev != prev, - "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", + "list_add corruption. next->prev should be prev (%px), but was %px. (next=%px).\n", prev, next->prev, next) || CHECK_DATA_CORRUPTION(prev->next != next, - "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", + "list_add corruption. prev->next should be next (%px), but was %px. 
(prev=%px).\n", next, prev->next, prev) || CHECK_DATA_CORRUPTION(new == prev || new == next, - "list_add double add: new=%p, prev=%p, next=%p.\n", + "list_add double add: new=%px, prev=%px, next=%px.\n", new, prev, next)) return false; @@ -43,16 +43,16 @@ bool __list_del_entry_valid(struct list_head *entry) next = entry->next; if (CHECK_DATA_CORRUPTION(next == LIST_POISON1, - "list_del corruption, %p->next is LIST_POISON1 (%p)\n", + "list_del corruption, %px->next is LIST_POISON1 (%px)\n", entry, LIST_POISON1) || CHECK_DATA_CORRUPTION(prev == LIST_POISON2, - "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", + "list_del corruption, %px->prev is LIST_POISON2 (%px)\n", entry, LIST_POISON2) || CHECK_DATA_CORRUPTION(prev->next != entry, - "list_del corruption. prev->next should be %p, but was %p\n", + "list_del corruption. prev->next should be %px, but was %px\n", entry, prev->next) || CHECK_DATA_CORRUPTION(next->prev != entry, - "list_del corruption. next->prev should be %p, but was %p\n", + "list_del corruption. next->prev should be %px, but was %px\n", entry, next->prev)) return false; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8e00138d593f..da9e10c827df 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -146,7 +146,7 @@ static unsigned int radix_tree_descend(const struct radix_tree_node *parent, static inline gfp_t root_gfp_mask(const struct radix_tree_root *root) { - return root->gfp_mask & __GFP_BITS_MASK; + return root->gfp_mask & (__GFP_BITS_MASK & ~GFP_ZONEMASK); } static inline void tag_set(struct radix_tree_node *node, unsigned int tag, @@ -2285,6 +2285,7 @@ void __init radix_tree_init(void) int ret; BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); + BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK); radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 413367cf569e..de16f7869fb1 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -296,15 +296,17 @@ static void __init test_bitmap_parselist(void) } } +#define EXP_BYTES (sizeof(exp) * 8) + static void __init test_bitmap_arr32(void) { - unsigned int nbits, next_bit, len = sizeof(exp) * 8; + unsigned int nbits, next_bit; u32 arr[sizeof(exp) / 4]; - DECLARE_BITMAP(bmap2, len); + DECLARE_BITMAP(bmap2, EXP_BYTES); memset(arr, 0xa5, sizeof(arr)); - for (nbits = 0; nbits < len; ++nbits) { + for (nbits = 0; nbits < EXP_BYTES; ++nbits) { bitmap_to_arr32(arr, exp, nbits); bitmap_from_arr32(bmap2, arr, nbits); expect_eq_bitmap(bmap2, exp, nbits); @@ -316,7 +318,7 @@ static void __init test_bitmap_arr32(void) " tail is not safely cleared: %d\n", nbits, next_bit); - if (nbits < len - 32) + if (nbits < EXP_BYTES - 32) expect_eq_uint(arr[DIV_ROUND_UP(nbits, 32)], 0xa5a5a5a5); } diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 98854a64b014..ec657105edbf 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -567,7 +567,15 @@ static noinline void __init kmem_cache_invalid_free(void) return; } + /* Trigger invalid free, the object doesn't get freed */ kmem_cache_free(cache, p + 1); + + /* + * Properly free the object to prevent the "Objects remaining in + * test_cache on __kmem_cache_shutdown" BUG failure. 
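The %p → %px switch in the lib/list_debug.c hunks above is what keeps these corruption reports usable: since 4.15 plain %p prints a hashed value, while %px prints the raw address (an acceptable leak here, since the kernel is already reporting data corruption). A small illustration of the two specifiers:

```c
#include <linux/printk.h>

static void example_print(const void *obj)
{
	/* %p prints a hashed value; %px prints the raw address. */
	pr_info("hashed %p, raw %px\n", obj, obj);
}
```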
+ */ + kmem_cache_free(cache, p); + kmem_cache_destroy(cache); } diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c new file mode 100644 index 000000000000..280f4979d00e --- /dev/null +++ b/lib/test_ubsan.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> + +typedef void(*test_ubsan_fp)(void); + +static void test_ubsan_add_overflow(void) +{ + volatile int val = INT_MAX; + + val += 2; +} + +static void test_ubsan_sub_overflow(void) +{ + volatile int val = INT_MIN; + volatile int val2 = 2; + + val -= val2; +} + +static void test_ubsan_mul_overflow(void) +{ + volatile int val = INT_MAX / 2; + + val *= 3; +} + +static void test_ubsan_negate_overflow(void) +{ + volatile int val = INT_MIN; + + val = -val; +} + +static void test_ubsan_divrem_overflow(void) +{ + volatile int val = 16; + volatile int val2 = 0; + + val /= val2; +} + +static void test_ubsan_vla_bound_not_positive(void) +{ + volatile int size = -1; + char buf[size]; + + (void)buf; +} + +static void test_ubsan_shift_out_of_bounds(void) +{ + volatile int val = -1; + int val2 = 10; + + val2 <<= val; +} + +static void test_ubsan_out_of_bounds(void) +{ + volatile int i = 4, j = 5; + volatile int arr[i]; + + arr[j] = i; +} + +static void test_ubsan_load_invalid_value(void) +{ + volatile char *dst, *src; + bool val, val2, *ptr; + char c = 4; + + dst = (char *)&val; + src = &c; + *dst = *src; + + ptr = &val2; + val2 = val; +} + +static void test_ubsan_null_ptr_deref(void) +{ + volatile int *ptr = NULL; + int val; + + val = *ptr; +} + +static void test_ubsan_misaligned_access(void) +{ + volatile char arr[5] __aligned(4) = {1, 2, 3, 4, 5}; + volatile int *ptr, val = 6; + + ptr = (int *)(arr + 1); + *ptr = val; +} + +static void test_ubsan_object_size_mismatch(void) +{ + /* "((aligned(8)))" helps this not into be misaligned for ptr-access. */ + volatile int val __aligned(8) = 4; + volatile long long *ptr, val2; + + ptr = (long long *)&val; + val2 = *ptr; +} + +static const test_ubsan_fp test_ubsan_array[] = { + test_ubsan_add_overflow, + test_ubsan_sub_overflow, + test_ubsan_mul_overflow, + test_ubsan_negate_overflow, + test_ubsan_divrem_overflow, + test_ubsan_vla_bound_not_positive, + test_ubsan_shift_out_of_bounds, + test_ubsan_out_of_bounds, + test_ubsan_load_invalid_value, + //test_ubsan_null_ptr_deref, /* exclude it because there is a crash */ + test_ubsan_misaligned_access, + test_ubsan_object_size_mismatch, +}; + +static int __init test_ubsan_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(test_ubsan_array); i++) + test_ubsan_array[i](); + + (void)test_ubsan_null_ptr_deref; /* to avoid unsed-function warning */ + return 0; +} +module_init(test_ubsan_init); + +static void __exit test_ubsan_exit(void) +{ + /* do nothing */ +} +module_exit(test_ubsan_exit); + +MODULE_AUTHOR("Jinbum Park <jinb.park7@gmail.com>"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 89f8a4a4b770..30c0cb8cc9bc 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -336,7 +336,7 @@ char *put_dec(char *buf, unsigned long long n) * * If speed is not important, use snprintf(). It's easy to read the code. */ -int num_to_str(char *buf, int size, unsigned long long num) +int num_to_str(char *buf, int size, unsigned long long num, unsigned int width) { /* put_dec requires 2-byte alignment of the buffer. 
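The extra width argument taken by num_to_str() (its body continues below) right-aligns the number by prepending spaces whenever width exceeds the digit count, and the returned length includes that padding; this is what lets /proc emit fixed-width columns without falling back to snprintf(). A hedged usage sketch:

```c
#include <linux/kernel.h>

static int example_width(void)
{
	char buf[16];

	/*
	 * Per the padding logic below: 42 needs two digits, so three
	 * leading spaces are prepended and 5 is returned. Note that
	 * num_to_str() does not NUL-terminate the buffer.
	 */
	return num_to_str(buf, sizeof(buf), 42, 5);	/* buf: "   42" */
}
```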
*/ char tmp[sizeof(num) * 3] __aligned(2); @@ -350,11 +350,21 @@ int num_to_str(char *buf, int size, unsigned long long num) len = put_dec(tmp, num) - tmp; } - if (len > size) + if (len > size || width > size) return 0; + + if (width > len) { + width = width - len; + for (idx = 0; idx < width; idx++) + buf[idx] = ' '; + } else { + width = 0; + } + for (idx = 0; idx < len; ++idx) - buf[idx] = tmp[len - idx - 1]; - return len; + buf[idx + width] = tmp[len - idx - 1]; + + return len + width; } #define SIGN 1 /* unsigned/signed, must be 1 */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 08b9aab631ab..023190c69dce 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1020,23 +1020,18 @@ EXPORT_SYMBOL(congestion_wait); /** * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes - * @pgdat: A pgdat to check if it is heavily congested * @sync: SYNC or ASYNC IO * @timeout: timeout in jiffies * - * In the event of a congested backing_dev (any backing_dev) and the given - * @pgdat has experienced recent congestion, this waits for up to @timeout - * jiffies for either a BDI to exit congestion of the given @sync queue - * or a write to complete. - * - * In the absence of pgdat congestion, cond_resched() is called to yield - * the processor if necessary but otherwise does not sleep. + * In the event of a congested backing_dev (any backing_dev) this waits + * for up to @timeout jiffies for either a BDI to exit congestion of the + * given @sync queue or a write to complete. * * The return value is 0 if the sleep is for the full timeout. Otherwise, * it is the number of jiffies that were still remaining when the function * returned. return_value == timeout implies the function did not sleep. */ -long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout) +long wait_iff_congested(int sync, long timeout) { long ret; unsigned long start = jiffies; @@ -1044,12 +1039,10 @@ long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout) wait_queue_head_t *wqh = &congestion_wqh[sync]; /* - * If there is no congestion, or heavy congestion is not being - * encountered in the current pgdat, yield if necessary instead + * If there is no congestion, yield if necessary instead * of sleeping on the congestion queue */ - if (atomic_read(&nr_wb_congested[sync]) == 0 || - !test_bit(PGDAT_CONGESTED, &pgdat->flags)) { + if (atomic_read(&nr_wb_congested[sync]) == 0) { cond_resched(); /* In case we scheduled, work out time remaining */ @@ -39,6 +39,7 @@ #include <trace/events/cma.h> #include "cma.h" +#include "internal.h" struct cma cma_areas[MAX_CMA_AREAS]; unsigned cma_area_count; @@ -109,23 +110,25 @@ static int __init cma_activate_area(struct cma *cma) if (!cma->bitmap) return -ENOMEM; - WARN_ON_ONCE(!pfn_valid(pfn)); - zone = page_zone(pfn_to_page(pfn)); - do { unsigned j; base_pfn = pfn; + if (!pfn_valid(base_pfn)) + goto err; + + zone = page_zone(pfn_to_page(base_pfn)); for (j = pageblock_nr_pages; j; --j, pfn++) { - WARN_ON_ONCE(!pfn_valid(pfn)); + if (!pfn_valid(pfn)) + goto err; + /* - * alloc_contig_range requires the pfn range - * specified to be in the same zone. Make this - * simple by forcing the entire CMA resv range - * to be in the same zone. + * In init_cma_reserved_pageblock(), present_pages + * is adjusted with assumption that all pages in + * the pageblock come from a single zone. 
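The wait_iff_congested() change earlier in this hunk drops the pgdat argument because the per-node congestion tracking it consulted is removed elsewhere in this series; callers in the reclaim path are trimmed along these lines (illustrative only):

```c
#include <linux/backing-dev.h>
#include <linux/blkdev.h>

static void example_throttle(void)
{
	/* old: wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10); */
	wait_iff_congested(BLK_RW_ASYNC, HZ/10);
}
```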
*/ if (page_zone(pfn_to_page(pfn)) != zone) - goto not_in_zone; + goto err; } init_cma_reserved_pageblock(pfn_to_page(base_pfn)); } while (--i); @@ -139,7 +142,7 @@ static int __init cma_activate_area(struct cma *cma) return 0; -not_in_zone: +err: pr_err("CMA area %s could not be activated\n", cma->name); kfree(cma->bitmap); cma->count = 0; @@ -149,6 +152,41 @@ not_in_zone: static int __init cma_init_reserved_areas(void) { int i; + struct zone *zone; + pg_data_t *pgdat; + + if (!cma_area_count) + return 0; + + for_each_online_pgdat(pgdat) { + unsigned long start_pfn = UINT_MAX, end_pfn = 0; + + zone = &pgdat->node_zones[ZONE_MOVABLE]; + + /* + * In this case, we cannot adjust the zone range + * since it is now maximum node span and we don't + * know original zone range. + */ + if (populated_zone(zone)) + continue; + + for (i = 0; i < cma_area_count; i++) { + if (pfn_to_nid(cma_areas[i].base_pfn) != + pgdat->node_id) + continue; + + start_pfn = min(start_pfn, cma_areas[i].base_pfn); + end_pfn = max(end_pfn, cma_areas[i].base_pfn + + cma_areas[i].count); + } + + if (!end_pfn) + continue; + + zone->zone_start_pfn = start_pfn; + zone->spanned_pages = end_pfn - start_pfn; + } for (i = 0; i < cma_area_count; i++) { int ret = cma_activate_area(&cma_areas[i]); @@ -157,9 +195,32 @@ static int __init cma_init_reserved_areas(void) return ret; } + /* + * Reserved pages for ZONE_MOVABLE are now activated and + * this would change ZONE_MOVABLE's managed page counter and + * the other zones' present counter. We need to re-calculate + * various zone information that depends on this initialization. + */ + build_all_zonelists(NULL); + for_each_populated_zone(zone) { + if (zone_idx(zone) == ZONE_MOVABLE) { + zone_pcp_reset(zone); + setup_zone_pageset(zone); + } else + zone_pcp_update(zone); + + set_zone_contiguous(zone); + } + + /* + * We need to re-init per zone wmark by calling + * init_per_zone_wmark_min() but doesn't call here because it is + * registered on core_initcall and it will be called later than us. + */ + return 0; } -core_initcall(cma_init_reserved_areas); +pure_initcall(cma_init_reserved_areas); /** * cma_init_reserved_mem() - create custom contiguous area from reserved memory diff --git a/mm/compaction.c b/mm/compaction.c index 88d01a50a015..028b7210a669 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1166,8 +1166,7 @@ static void isolate_freepages(struct compact_control *cc) * from the isolated freelists in the block we are migrating to. */ static struct page *compaction_alloc(struct page *migratepage, - unsigned long data, - int **result) + unsigned long data) { struct compact_control *cc = (struct compact_control *)data; struct page *freepage; @@ -1451,14 +1450,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * if compaction succeeds. * For costly orders, we require low watermark instead of min for * compaction to proceed to increase its chances. - * ALLOC_CMA is used, as pages in CMA pageblocks are considered - * suitable migration targets */ watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? 
low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, - ALLOC_CMA, wmark_target)) + 0, wmark_target)) return COMPACT_SKIPPED; return COMPACT_CONTINUE; diff --git a/mm/filemap.c b/mm/filemap.c index 693f62212a59..ab77e19ab09c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -66,7 +66,7 @@ * ->i_mmap_rwsem (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) - * ->mapping->tree_lock + * ->i_pages lock * * ->i_mutex * ->i_mmap_rwsem (truncate->unmap_mapping_range) @@ -74,7 +74,7 @@ * ->mmap_sem * ->i_mmap_rwsem * ->page_table_lock or pte_lock (various, mainly in memory.c) - * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) + * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_sem * ->lock_page (access_process_vm) @@ -84,7 +84,7 @@ * * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) - * ->mapping->tree_lock (__sync_single_inode) + * ->i_pages lock (__sync_single_inode) * * ->i_mmap_rwsem * ->anon_vma.lock (vma_adjust) @@ -95,11 +95,11 @@ * ->page_table_lock or pte_lock * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) - * ->tree_lock (try_to_unmap_one) + * ->i_pages lock (try_to_unmap_one) * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) - * ->tree_lock (page_remove_rmap->set_page_dirty) + * ->i_pages lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) @@ -118,14 +118,15 @@ static int page_cache_tree_insert(struct address_space *mapping, void **slot; int error; - error = __radix_tree_create(&mapping->page_tree, page->index, 0, + error = __radix_tree_create(&mapping->i_pages, page->index, 0, &node, &slot); if (error) return error; if (*slot) { void *p; - p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + p = radix_tree_deref_slot_protected(slot, + &mapping->i_pages.xa_lock); if (!radix_tree_exceptional_entry(p)) return -EEXIST; @@ -133,7 +134,7 @@ static int page_cache_tree_insert(struct address_space *mapping, if (shadowp) *shadowp = p; } - __radix_tree_replace(&mapping->page_tree, node, slot, page, + __radix_tree_replace(&mapping->i_pages, node, slot, page, workingset_lookup_update(mapping)); mapping->nrpages++; return 0; @@ -155,13 +156,13 @@ static void page_cache_tree_delete(struct address_space *mapping, struct radix_tree_node *node; void **slot; - __radix_tree_lookup(&mapping->page_tree, page->index + i, + __radix_tree_lookup(&mapping->i_pages, page->index + i, &node, &slot); VM_BUG_ON_PAGE(!node && nr != 1, page); - radix_tree_clear_tags(&mapping->page_tree, node, slot); - __radix_tree_replace(&mapping->page_tree, node, slot, shadow, + radix_tree_clear_tags(&mapping->i_pages, node, slot); + __radix_tree_replace(&mapping->i_pages, node, slot, shadow, workingset_lookup_update(mapping)); } @@ -253,7 +254,7 @@ static void unaccount_page_cache_page(struct address_space *mapping, /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock. + * is safe. The caller must hold the i_pages lock. 
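The xa_lock_*() calls that appear from here on are not a new locking scheme: this series moves the page-cache spinlock into the radix tree root (reachable as i_pages.xa_lock) and accesses it through thin wrappers. For reference, a sketch of those wrappers; the authoritative definitions live in the xarray patch of this same merge:

```c
/* Sketch of the accessors used in the hunks below. */
#define xa_lock_irq(xa)		spin_lock_irq(&(xa)->xa_lock)
#define xa_unlock_irq(xa)	spin_unlock_irq(&(xa)->xa_lock)
#define xa_lock_irqsave(xa, flags) \
				spin_lock_irqsave(&(xa)->xa_lock, flags)
#define xa_unlock_irqrestore(xa, flags) \
				spin_unlock_irqrestore(&(xa)->xa_lock, flags)
```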
*/ void __delete_from_page_cache(struct page *page, void *shadow) { @@ -296,9 +297,9 @@ void delete_from_page_cache(struct page *page) unsigned long flags; BUG_ON(!PageLocked(page)); - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); __delete_from_page_cache(page, NULL); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); page_cache_free_page(mapping, page); } @@ -309,14 +310,14 @@ EXPORT_SYMBOL(delete_from_page_cache); * @mapping: the mapping to which pages belong * @pvec: pagevec with pages to delete * - * The function walks over mapping->page_tree and removes pages passed in @pvec - * from the radix tree. The function expects @pvec to be sorted by page index. - * It tolerates holes in @pvec (radix tree entries at those indices are not + * The function walks over mapping->i_pages and removes pages passed in @pvec + * from the mapping. The function expects @pvec to be sorted by page index. + * It tolerates holes in @pvec (mapping entries at those indices are not * modified). The function expects only THP head pages to be present in the - * @pvec and takes care to delete all corresponding tail pages from the radix - * tree as well. + * @pvec and takes care to delete all corresponding tail pages from the + * mapping as well. * - * The function expects mapping->tree_lock to be held. + * The function expects the i_pages lock to be held. */ static void page_cache_tree_delete_batch(struct address_space *mapping, @@ -330,11 +331,11 @@ page_cache_tree_delete_batch(struct address_space *mapping, pgoff_t start; start = pvec->pages[0]->index; - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (i >= pagevec_count(pvec) && !tail_pages) break; page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (radix_tree_exceptional_entry(page)) continue; if (!tail_pages) { @@ -357,8 +358,8 @@ page_cache_tree_delete_batch(struct address_space *mapping, } else { tail_pages--; } - radix_tree_clear_tags(&mapping->page_tree, iter.node, slot); - __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL, + radix_tree_clear_tags(&mapping->i_pages, iter.node, slot); + __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL, workingset_lookup_update(mapping)); total_pages++; } @@ -374,14 +375,14 @@ void delete_from_page_cache_batch(struct address_space *mapping, if (!pagevec_count(pvec)) return; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); for (i = 0; i < pagevec_count(pvec); i++) { trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); unaccount_page_cache_page(mapping, pvec->pages[i]); } page_cache_tree_delete_batch(mapping, pvec); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); for (i = 0; i < pagevec_count(pvec); i++) page_cache_free_page(mapping, pvec->pages[i]); @@ -798,7 +799,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); __delete_from_page_cache(old, NULL); error = page_cache_tree_insert(mapping, new, NULL); BUG_ON(error); @@ -810,7 +811,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) __inc_node_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(new)) 
__inc_node_page_state(new, NR_SHMEM); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); mem_cgroup_migrate(old, new); radix_tree_preload_end(); if (freepage) @@ -852,7 +853,7 @@ static int __add_to_page_cache_locked(struct page *page, page->mapping = mapping; page->index = offset; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); error = page_cache_tree_insert(mapping, page, shadowp); radix_tree_preload_end(); if (unlikely(error)) @@ -861,7 +862,7 @@ static int __add_to_page_cache_locked(struct page *page, /* hugetlb pages do not participate in page cache accounting. */ if (!huge) __inc_node_page_state(page, NR_FILE_PAGES); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); @@ -869,7 +870,7 @@ static int __add_to_page_cache_locked(struct page *page, err_insert: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_cancel_charge(page, memcg, false); put_page(page); @@ -1353,7 +1354,7 @@ pgoff_t page_cache_next_hole(struct address_space *mapping, for (i = 0; i < max_scan; i++) { struct page *page; - page = radix_tree_lookup(&mapping->page_tree, index); + page = radix_tree_lookup(&mapping->i_pages, index); if (!page || radix_tree_exceptional_entry(page)) break; index++; @@ -1394,7 +1395,7 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping, for (i = 0; i < max_scan; i++) { struct page *page; - page = radix_tree_lookup(&mapping->page_tree, index); + page = radix_tree_lookup(&mapping->i_pages, index); if (!page || radix_tree_exceptional_entry(page)) break; index--; @@ -1427,7 +1428,7 @@ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) rcu_read_lock(); repeat: page = NULL; - pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); + pagep = radix_tree_lookup_slot(&mapping->i_pages, offset); if (pagep) { page = radix_tree_deref_slot(pagep); if (unlikely(!page)) @@ -1633,7 +1634,7 @@ unsigned find_get_entries(struct address_space *mapping, return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -1710,7 +1711,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) { struct page *head, *page; if (iter.index > end) @@ -1795,7 +1796,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, return 0; rcu_read_lock(); - radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { + radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -1875,8 +1876,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, - &iter, *index, tag) { + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) { struct page *head, *page; if (iter.index > end) @@ -1969,8 +1969,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, 
return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, - &iter, start, tag) { + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -2624,8 +2623,7 @@ void filemap_map_pages(struct vm_fault *vmf, struct page *head, *page; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, - start_pgoff) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) { if (iter.index > end_pgoff) break; repeat: @@ -160,6 +160,32 @@ static void hmm_invalidate_range(struct hmm *hmm, up_read(&hmm->mirrors_sem); } +static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) +{ + struct hmm_mirror *mirror; + struct hmm *hmm = mm->hmm; + + down_write(&hmm->mirrors_sem); + mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, + list); + while (mirror) { + list_del_init(&mirror->list); + if (mirror->ops->release) { + /* + * Drop mirrors_sem so callback can wait on any pending + * work that might itself trigger mmu_notifier callback + * and thus would deadlock with us. + */ + up_write(&hmm->mirrors_sem); + mirror->ops->release(mirror); + down_write(&hmm->mirrors_sem); + } + mirror = list_first_entry_or_null(&hmm->mirrors, + struct hmm_mirror, list); + } + up_write(&hmm->mirrors_sem); +} + static void hmm_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, @@ -185,6 +211,7 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn, } static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { + .release = hmm_release, .invalidate_range_start = hmm_invalidate_range_start, .invalidate_range_end = hmm_invalidate_range_end, }; @@ -206,13 +233,24 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) if (!mm || !mirror || !mirror->ops) return -EINVAL; +again: mirror->hmm = hmm_register(mm); if (!mirror->hmm) return -ENOMEM; down_write(&mirror->hmm->mirrors_sem); - list_add(&mirror->list, &mirror->hmm->mirrors); - up_write(&mirror->hmm->mirrors_sem); + if (mirror->hmm->mm == NULL) { + /* + * A racing hmm_mirror_unregister() is about to destroy the hmm + * struct. Try again to allocate a new one. 
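The new ->release callback above lets a driver learn that the mm is going away, which is also why hmm_mirror_register() now retries when it races with hmm_mirror_unregister(). A hypothetical driver-side use of the mirror API (all my_* names are invented for illustration):

```c
#include <linux/hmm.h>

static void my_mirror_release(struct hmm_mirror *mirror)
{
	/* tear down all device mappings of this address space */
}

static const struct hmm_mirror_ops my_mirror_ops = {
	.release = my_mirror_release,
	/* ->sync_cpu_device_pagetables would be filled in as well */
};

static struct hmm_mirror my_mirror = {
	.ops = &my_mirror_ops,
};

static int my_bind(struct mm_struct *mm)
{
	return hmm_mirror_register(&my_mirror, mm);
}

static void my_unbind(void)
{
	hmm_mirror_unregister(&my_mirror);
}
```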
+ */ + up_write(&mirror->hmm->mirrors_sem); + mirror->hmm = NULL; + goto again; + } else { + list_add(&mirror->list, &mirror->hmm->mirrors); + up_write(&mirror->hmm->mirrors_sem); + } return 0; } @@ -227,11 +265,32 @@ EXPORT_SYMBOL(hmm_mirror_register); */ void hmm_mirror_unregister(struct hmm_mirror *mirror) { - struct hmm *hmm = mirror->hmm; + bool should_unregister = false; + struct mm_struct *mm; + struct hmm *hmm; + if (mirror->hmm == NULL) + return; + + hmm = mirror->hmm; down_write(&hmm->mirrors_sem); - list_del(&mirror->list); + list_del_init(&mirror->list); + should_unregister = list_empty(&hmm->mirrors); + mirror->hmm = NULL; + mm = hmm->mm; + hmm->mm = NULL; up_write(&hmm->mirrors_sem); + + if (!should_unregister || mm == NULL) + return; + + spin_lock(&mm->page_table_lock); + if (mm->hmm == hmm) + mm->hmm = NULL; + spin_unlock(&mm->page_table_lock); + + mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); + kfree(hmm); } EXPORT_SYMBOL(hmm_mirror_unregister); @@ -240,110 +299,275 @@ struct hmm_vma_walk { unsigned long last; bool fault; bool block; - bool write; }; -static int hmm_vma_do_fault(struct mm_walk *walk, - unsigned long addr, - hmm_pfn_t *pfn) +static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, + bool write_fault, uint64_t *pfn) { unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; int r; flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; - flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0; + flags |= write_fault ? FAULT_FLAG_WRITE : 0; r = handle_mm_fault(vma, addr, flags); if (r & VM_FAULT_RETRY) return -EBUSY; if (r & VM_FAULT_ERROR) { - *pfn = HMM_PFN_ERROR; + *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; } return -EAGAIN; } -static void hmm_pfns_special(hmm_pfn_t *pfns, - unsigned long addr, - unsigned long end) -{ - for (; addr < end; addr += PAGE_SIZE, pfns++) - *pfns = HMM_PFN_SPECIAL; -} - static int hmm_pfns_bad(unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct hmm_range *range = walk->private; - hmm_pfn_t *pfns = range->pfns; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + uint64_t *pfns = range->pfns; unsigned long i; i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) - pfns[i] = HMM_PFN_ERROR; + pfns[i] = range->values[HMM_PFN_ERROR]; return 0; } -static void hmm_pfns_clear(hmm_pfn_t *pfns, - unsigned long addr, - unsigned long end) -{ - for (; addr < end; addr += PAGE_SIZE, pfns++) - *pfns = 0; -} - -static int hmm_vma_walk_hole(unsigned long addr, - unsigned long end, - struct mm_walk *walk) +/* + * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) + * @start: range virtual start address (inclusive) + * @end: range virtual end address (exclusive) + * @fault: should we fault or not ? + * @write_fault: write fault ? + * @walk: mm_walk structure + * Returns: 0 on success, -EAGAIN after page fault, or page fault error + * + * This function will be called whenever pmd_none() or pte_none() returns true, + * or whenever there is no page directory covering the virtual address range. 
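From here on the walker no longer hard-codes pfn bits such as HMM_PFN_WRITE; it looks them up in range->flags[] and range->values[], arrays the driver supplies so that snapshot entries can use the device's own encoding. A hypothetical encoding (bit positions invented for illustration):

```c
#include <linux/hmm.h>

/* Which bit means what in the driver's encoding of a snapshot entry. */
static uint64_t my_pfn_flags[] = {
	[HMM_PFN_VALID]		 = 1ULL << 0,
	[HMM_PFN_WRITE]		 = 1ULL << 1,
	[HMM_PFN_DEVICE_PRIVATE] = 1ULL << 2,
};

/* Special values for entries that carry no pfn at all. */
static uint64_t my_pfn_values[] = {
	[HMM_PFN_NONE]		 = 0,
	[HMM_PFN_ERROR]		 = 1ULL << 62,
	[HMM_PFN_SPECIAL]	 = 1ULL << 63,
};
```

These are the arrays that the range->flags[...] and range->values[...] look-ups in the following hunks read from.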
+ */ +static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, + bool fault, bool write_fault, + struct mm_walk *walk) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - hmm_pfn_t *pfns = range->pfns; + uint64_t *pfns = range->pfns; unsigned long i; hmm_vma_walk->last = addr; i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) { - pfns[i] = HMM_PFN_EMPTY; - if (hmm_vma_walk->fault) { + pfns[i] = range->values[HMM_PFN_NONE]; + if (fault || write_fault) { int ret; - ret = hmm_vma_do_fault(walk, addr, &pfns[i]); + ret = hmm_vma_do_fault(walk, addr, write_fault, + &pfns[i]); if (ret != -EAGAIN) return ret; } } - return hmm_vma_walk->fault ? -EAGAIN : 0; + return (fault || write_fault) ? -EAGAIN : 0; } -static int hmm_vma_walk_clear(unsigned long addr, - unsigned long end, - struct mm_walk *walk) +static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + uint64_t pfns, uint64_t cpu_flags, + bool *fault, bool *write_fault) { - struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - hmm_pfn_t *pfns = range->pfns; + + *fault = *write_fault = false; + if (!hmm_vma_walk->fault) + return; + + /* We aren't ask to do anything ... */ + if (!(pfns & range->flags[HMM_PFN_VALID])) + return; + /* If this is device memory than only fault if explicitly requested */ + if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { + /* Do we fault on device memory ? */ + if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { + *write_fault = pfns & range->flags[HMM_PFN_WRITE]; + *fault = true; + } + return; + } + + /* If CPU page table is not valid then we need to fault */ + *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]); + /* Need to write fault ? */ + if ((pfns & range->flags[HMM_PFN_WRITE]) && + !(cpu_flags & range->flags[HMM_PFN_WRITE])) { + *write_fault = true; + *fault = true; + } +} + +static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + const uint64_t *pfns, unsigned long npages, + uint64_t cpu_flags, bool *fault, + bool *write_fault) +{ unsigned long i; - hmm_vma_walk->last = addr; + if (!hmm_vma_walk->fault) { + *fault = *write_fault = false; + return; + } + + for (i = 0; i < npages; ++i) { + hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, + fault, write_fault); + if ((*fault) || (*write_fault)) + return; + } +} + +static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + bool fault, write_fault; + unsigned long i, npages; + uint64_t *pfns; + i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) { - pfns[i] = 0; - if (hmm_vma_walk->fault) { - int ret; + npages = (end - addr) >> PAGE_SHIFT; + pfns = &range->pfns[i]; + hmm_range_need_fault(hmm_vma_walk, pfns, npages, + 0, &fault, &write_fault); + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); +} - ret = hmm_vma_do_fault(walk, addr, &pfns[i]); - if (ret != -EAGAIN) - return ret; +static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) +{ + if (pmd_protnone(pmd)) + return 0; + return pmd_write(pmd) ? 
range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; +} + +static int hmm_vma_handle_pmd(struct mm_walk *walk, + unsigned long addr, + unsigned long end, + uint64_t *pfns, + pmd_t pmd) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned long pfn, npages, i; + bool fault, write_fault; + uint64_t cpu_flags; + + npages = (end - addr) >> PAGE_SHIFT; + cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); + hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, + &fault, &write_fault); + + if (pmd_protnone(pmd) || fault || write_fault) + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + + pfn = pmd_pfn(pmd) + pte_index(addr); + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) + pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + hmm_vma_walk->last = end; + return 0; +} + +static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) +{ + if (pte_none(pte) || !pte_present(pte)) + return 0; + return pte_write(pte) ? range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; +} + +static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, + unsigned long end, pmd_t *pmdp, pte_t *ptep, + uint64_t *pfn) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + bool fault, write_fault; + uint64_t cpu_flags; + pte_t pte = *ptep; + uint64_t orig_pfn = *pfn; + + *pfn = range->values[HMM_PFN_NONE]; + cpu_flags = pte_to_hmm_pfn_flags(range, pte); + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); + + if (pte_none(pte)) { + if (fault || write_fault) + goto fault; + return 0; + } + + if (!pte_present(pte)) { + swp_entry_t entry = pte_to_swp_entry(pte); + + if (!non_swap_entry(entry)) { + if (fault || write_fault) + goto fault; + return 0; } + + /* + * This is a special swap entry, ignore migration, use + * device and report anything else as error. + */ + if (is_device_private_entry(entry)) { + cpu_flags = range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_DEVICE_PRIVATE]; + cpu_flags |= is_write_device_private_entry(entry) ? + range->flags[HMM_PFN_WRITE] : 0; + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); + if (fault || write_fault) + goto fault; + *pfn = hmm_pfn_from_pfn(range, swp_offset(entry)); + *pfn |= cpu_flags; + return 0; + } + + if (is_migration_entry(entry)) { + if (fault || write_fault) { + pte_unmap(ptep); + hmm_vma_walk->last = addr; + migration_entry_wait(vma->vm_mm, + pmdp, addr); + return -EAGAIN; + } + return 0; + } + + /* Report error for everything else */ + *pfn = range->values[HMM_PFN_ERROR]; + return -EFAULT; } - return hmm_vma_walk->fault ? 
-EAGAIN : 0; + if (fault || write_fault) + goto fault; + + *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; + return 0; + +fault: + pte_unmap(ptep); + /* Fault any virtual address we were asked to fault */ + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); } static int hmm_vma_walk_pmd(pmd_t *pmdp, @@ -353,26 +577,20 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - struct vm_area_struct *vma = walk->vma; - hmm_pfn_t *pfns = range->pfns; + uint64_t *pfns = range->pfns; unsigned long addr = start, i; - bool write_fault; - hmm_pfn_t flag; pte_t *ptep; i = (addr - range->start) >> PAGE_SHIFT; - flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0; - write_fault = hmm_vma_walk->fault & hmm_vma_walk->write; again: if (pmd_none(*pmdp)) return hmm_vma_walk_hole(start, end, walk); - if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB) + if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB)) return hmm_pfns_bad(start, end, walk); if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { - unsigned long pfn; pmd_t pmd; /* @@ -388,17 +606,8 @@ again: barrier(); if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; - if (pmd_protnone(pmd)) - return hmm_vma_walk_clear(start, end, walk); - if (write_fault && !pmd_write(pmd)) - return hmm_vma_walk_clear(start, end, walk); - - pfn = pmd_pfn(pmd) + pte_index(addr); - flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; - for (; addr < end; addr += PAGE_SIZE, i++, pfn++) - pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag; - return 0; + return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd); } if (pmd_bad(*pmdp)) @@ -406,79 +615,43 @@ again: ptep = pte_offset_map(pmdp, addr); for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { - pte_t pte = *ptep; - - pfns[i] = 0; + int r; - if (pte_none(pte)) { - pfns[i] = HMM_PFN_EMPTY; - if (hmm_vma_walk->fault) - goto fault; - continue; + r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]); + if (r) { + /* hmm_vma_handle_pte() did unmap pte directory */ + hmm_vma_walk->last = addr; + return r; } - - if (!pte_present(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - - if (!non_swap_entry(entry)) { - if (hmm_vma_walk->fault) - goto fault; - continue; - } - - /* - * This is a special swap entry, ignore migration, use - * device and report anything else as error. - */ - if (is_device_private_entry(entry)) { - pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry)); - if (is_write_device_private_entry(entry)) { - pfns[i] |= HMM_PFN_WRITE; - } else if (write_fault) - goto fault; - pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE; - pfns[i] |= flag; - } else if (is_migration_entry(entry)) { - if (hmm_vma_walk->fault) { - pte_unmap(ptep); - hmm_vma_walk->last = addr; - migration_entry_wait(vma->vm_mm, - pmdp, addr); - return -EAGAIN; - } - continue; - } else { - /* Report error for everything else */ - pfns[i] = HMM_PFN_ERROR; - } - continue; - } - - if (write_fault && !pte_write(pte)) - goto fault; - - pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; - pfns[i] |= pte_write(pte) ? 
HMM_PFN_WRITE : 0; - continue; - -fault: - pte_unmap(ptep); - /* Fault all pages in range */ - return hmm_vma_walk_clear(start, end, walk); } pte_unmap(ptep - 1); + hmm_vma_walk->last = addr; return 0; } +static void hmm_pfns_clear(struct hmm_range *range, + uint64_t *pfns, + unsigned long addr, + unsigned long end) +{ + for (; addr < end; addr += PAGE_SIZE, pfns++) + *pfns = range->values[HMM_PFN_NONE]; +} + +static void hmm_pfns_special(struct hmm_range *range) +{ + unsigned long addr = range->start, i = 0; + + for (; addr < range->end; addr += PAGE_SIZE, i++) + range->pfns[i] = range->values[HMM_PFN_SPECIAL]; +} + /* * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses - * @vma: virtual memory area containing the virtual address range - * @range: used to track snapshot validity - * @start: range virtual start address (inclusive) - * @end: range virtual end address (exclusive) - * @entries: array of hmm_pfn_t: provided by the caller, filled in by function - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success + * @range: range being snapshotted + * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid + * vma permission, 0 success * * This snapshots the CPU page table for a range of virtual addresses. Snapshot * validity is tracked by range struct. See hmm_vma_range_done() for further @@ -491,26 +664,17 @@ fault: * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! */ -int hmm_vma_get_pfns(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns) +int hmm_vma_get_pfns(struct hmm_range *range) { + struct vm_area_struct *vma = range->vma; struct hmm_vma_walk hmm_vma_walk; struct mm_walk mm_walk; struct hmm *hmm; - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { - hmm_pfns_special(pfns, start, end); - return -EINVAL; - } - /* Sanity check, this really should not happen ! */ - if (start < vma->vm_start || start >= vma->vm_end) + if (range->start < vma->vm_start || range->start >= vma->vm_end) return -EINVAL; - if (end < vma->vm_start || end > vma->vm_end) + if (range->end < vma->vm_start || range->end > vma->vm_end) return -EINVAL; hmm = hmm_register(vma->vm_mm); @@ -520,10 +684,24 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, if (!hmm->mmu_notifier.ops) return -EINVAL; + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(range); + return -EINVAL; + } + + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma do not allow read access, then assume that it does + * not allow write access, either. Architecture that allow + * write without read access are not supported by HMM, because + * operations such has atomic access would not work. 
+ */ + hmm_pfns_clear(range, range->pfns, range->start, range->end); + return -EPERM; + } + /* Initialize range to track CPU page table update */ - range->start = start; - range->pfns = pfns; - range->end = end; spin_lock(&hmm->lock); range->valid = true; list_add_rcu(&range->list, &hmm->ranges); @@ -541,14 +719,13 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, mm_walk.pmd_entry = hmm_vma_walk_pmd; mm_walk.pte_hole = hmm_vma_walk_hole; - walk_page_range(start, end, &mm_walk); + walk_page_range(range->start, range->end, &mm_walk); return 0; } EXPORT_SYMBOL(hmm_vma_get_pfns); /* * hmm_vma_range_done() - stop tracking change to CPU page table over a range - * @vma: virtual memory area containing the virtual address range * @range: range being tracked * Returns: false if range data has been invalidated, true otherwise * @@ -568,10 +745,10 @@ EXPORT_SYMBOL(hmm_vma_get_pfns); * * There are two ways to use this : * again: - * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); + * hmm_vma_get_pfns(range); or hmm_vma_fault(...); * trans = device_build_page_table_update_transaction(pfns); * device_page_table_lock(); - * if (!hmm_vma_range_done(vma, range)) { + * if (!hmm_vma_range_done(range)) { * device_page_table_unlock(); * goto again; * } @@ -579,13 +756,13 @@ EXPORT_SYMBOL(hmm_vma_get_pfns); * device_page_table_unlock(); * * Or: - * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); + * hmm_vma_get_pfns(range); or hmm_vma_fault(...); * device_page_table_lock(); - * hmm_vma_range_done(vma, range); - * device_update_page_table(pfns); + * hmm_vma_range_done(range); + * device_update_page_table(range->pfns); * device_page_table_unlock(); */ -bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range) +bool hmm_vma_range_done(struct hmm_range *range) { unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; struct hmm *hmm; @@ -595,7 +772,7 @@ bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range) return false; } - hmm = hmm_register(vma->vm_mm); + hmm = hmm_register(range->vma->vm_mm); if (!hmm) { memset(range->pfns, 0, sizeof(*range->pfns) * npages); return false; @@ -611,36 +788,34 @@ EXPORT_SYMBOL(hmm_vma_range_done); /* * hmm_vma_fault() - try to fault some address in a virtual address range - * @vma: virtual memory area containing the virtual address range - * @range: use to track pfns array content validity - * @start: fault range virtual start address (inclusive) - * @end: fault range virtual end address (exclusive) - * @pfns: array of hmm_pfn_t, only entry with fault flag set will be faulted - * @write: is it a write fault + * @range: range being faulted * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) * * This is similar to a regular CPU page fault except that it will not trigger * any memory migration if the memory being faulted is not accessible by CPUs. * - * On error, for one virtual address in the range, the function will set the - * hmm_pfn_t error flag for the corresponding pfn entry. + * On error, for one virtual address in the range, the function will mark the + * corresponding HMM pfn entry with an error flag. 
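Because hmm_vma_get_pfns() and hmm_vma_range_done() now take only the range, the caller pre-fills the hmm_range structure (including the encoding arrays) instead of passing vma/start/end/pfns separately. A hedged sketch of the updated snapshot sequence, following the usage comment above and reusing the my_pfn_* arrays from the earlier sketch:

```c
static int my_snapshot(struct vm_area_struct *vma,
		       unsigned long start, unsigned long end,
		       uint64_t *pfns)
{
	struct hmm_range range = {
		.vma	= vma,
		.start	= start,
		.end	= end,
		.pfns	= pfns,
		.flags	= my_pfn_flags,
		.values	= my_pfn_values,
	};
	int ret;

	ret = hmm_vma_get_pfns(&range);
	if (ret)
		return ret;

	/* ... build the device page table update from range.pfns ... */

	if (!hmm_vma_range_done(&range))
		return -EAGAIN;	/* CPU page table changed underneath us */
	return 0;
}
```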
* * Expected use pattern: * retry: * down_read(&mm->mmap_sem); * // Find vma and address device wants to fault, initialize hmm_pfn_t * // array accordingly - * ret = hmm_vma_fault(vma, start, end, pfns, allow_retry); + * ret = hmm_vma_fault(range, write, block); * switch (ret) { * case -EAGAIN: - * hmm_vma_range_done(vma, range); + * hmm_vma_range_done(range); * // You might want to rate limit or yield to play nicely, you may * // also commit any valid pfn in the array assuming that you are * // getting true from hmm_vma_range_monitor_end() * goto retry; * case 0: * break; + * case -ENOMEM: + * case -EINVAL: + * case -EPERM: * default: * // Handle error ! * up_read(&mm->mmap_sem) @@ -648,7 +823,7 @@ EXPORT_SYMBOL(hmm_vma_range_done); * } * // Take device driver lock that serialize device page table update * driver_lock_device_page_table_update(); - * hmm_vma_range_done(vma, range); + * hmm_vma_range_done(range); * // Commit pfns we got from hmm_vma_fault() * driver_unlock_device_page_table_update(); * up_read(&mm->mmap_sem) @@ -658,51 +833,54 @@ EXPORT_SYMBOL(hmm_vma_range_done); * * YOU HAVE BEEN WARNED ! */ -int hmm_vma_fault(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns, - bool write, - bool block) +int hmm_vma_fault(struct hmm_range *range, bool block) { + struct vm_area_struct *vma = range->vma; + unsigned long start = range->start; struct hmm_vma_walk hmm_vma_walk; struct mm_walk mm_walk; struct hmm *hmm; int ret; /* Sanity check, this really should not happen ! */ - if (start < vma->vm_start || start >= vma->vm_end) + if (range->start < vma->vm_start || range->start >= vma->vm_end) return -EINVAL; - if (end < vma->vm_start || end > vma->vm_end) + if (range->end < vma->vm_start || range->end > vma->vm_end) return -EINVAL; hmm = hmm_register(vma->vm_mm); if (!hmm) { - hmm_pfns_clear(pfns, start, end); + hmm_pfns_clear(range, range->pfns, range->start, range->end); return -ENOMEM; } /* Caller must have registered a mirror using hmm_mirror_register() */ if (!hmm->mmu_notifier.ops) return -EINVAL; + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(range); + return -EINVAL; + } + + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma do not allow read access, then assume that it does + * not allow write access, either. Architecture that allow + * write without read access are not supported by HMM, because + * operations such has atomic access would not work. 
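A subtle consequence of hmm_vma_fault() losing its write argument: the desire for write access is now expressed per page, by seeding the input pfns array with the driver's VALID and WRITE flags before the call, which is exactly what hmm_pte_need_fault() above tests for. Illustratively:

```c
#include <linux/hmm.h>

/* Ask for a writable mapping of every page before hmm_vma_fault(). */
static void my_request_write(struct hmm_range *range)
{
	unsigned long i, npages = (range->end - range->start) >> PAGE_SHIFT;

	for (i = 0; i < npages; i++)
		range->pfns[i] = range->flags[HMM_PFN_VALID] |
				 range->flags[HMM_PFN_WRITE];
}
```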
+ */ + hmm_pfns_clear(range, range->pfns, range->start, range->end); + return -EPERM; + } + /* Initialize range to track CPU page table update */ - range->start = start; - range->pfns = pfns; - range->end = end; spin_lock(&hmm->lock); range->valid = true; list_add_rcu(&range->list, &hmm->ranges); spin_unlock(&hmm->lock); - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { - hmm_pfns_special(pfns, start, end); - return 0; - } - hmm_vma_walk.fault = true; - hmm_vma_walk.write = write; hmm_vma_walk.block = block; hmm_vma_walk.range = range; mm_walk.private = &hmm_vma_walk; @@ -717,7 +895,7 @@ int hmm_vma_fault(struct vm_area_struct *vma, mm_walk.pte_hole = hmm_vma_walk_hole; do { - ret = walk_page_range(start, end, &mm_walk); + ret = walk_page_range(start, range->end, &mm_walk); start = hmm_vma_walk.last; } while (ret == -EAGAIN); @@ -725,8 +903,9 @@ int hmm_vma_fault(struct vm_area_struct *vma, unsigned long i; i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; - hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end); - hmm_vma_range_done(vma, range); + hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last, + range->end); + hmm_vma_range_done(range); } return ret; } @@ -845,13 +1024,6 @@ static void hmm_devmem_release(struct device *dev, void *data) hmm_devmem_radix_release(resource); } -static struct hmm_devmem *hmm_devmem_find(resource_size_t phys) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - return radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT); -} - static int hmm_devmem_pages_create(struct hmm_devmem *devmem) { resource_size_t key, align_start, align_size, align_end; @@ -892,9 +1064,8 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) { struct hmm_devmem *dup; - rcu_read_lock(); - dup = hmm_devmem_find(key); - rcu_read_unlock(); + dup = radix_tree_lookup(&hmm_devmem_radix, + key >> PA_SECTION_SHIFT); if (dup) { dev_err(device, "%s: collides with mapping for %s\n", __func__, dev_name(dup->device)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f0ae8d1d4329..14ed6ee5e02f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -555,8 +555,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg, - true)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1317,7 +1316,7 @@ alloc: } if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, - huge_gfp | __GFP_NORETRY, &memcg, true))) { + huge_gfp, &memcg, true))) { put_page(new_page); split_huge_pmd(vma, vmf->pmd, vmf->address); if (page) @@ -2402,6 +2401,12 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail->index = head->index + tail; page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); + + /* + * always add to the tail because some iterators expect new + * pages to show after the currently processed elements - e.g. 
+ * migrate_pages + */ lru_add_page_tail(head, page_tail, lruvec, list); } @@ -2445,7 +2450,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } else { /* Additional pin to radix tree */ page_ref_add(head, 2); - spin_unlock(&head->mapping->tree_lock); + xa_unlock(&head->mapping->i_pages); } spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); @@ -2653,15 +2658,15 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mapping) { void **pslot; - spin_lock(&mapping->tree_lock); - pslot = radix_tree_lookup_slot(&mapping->page_tree, + xa_lock(&mapping->i_pages); + pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(head)); /* * Check if the head page is present in radix tree. * We assume all tail are present too, if head is there. */ if (radix_tree_deref_slot_protected(pslot, - &mapping->tree_lock) != head) + &mapping->i_pages.xa_lock) != head) goto fail; } @@ -2695,7 +2700,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } spin_unlock(&pgdata->split_queue_lock); fail: if (mapping) - spin_unlock(&mapping->tree_lock); + xa_unlock(&mapping->i_pages); spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); unfreeze_page(head); ret = -EBUSY; diff --git a/mm/internal.h b/mm/internal.h index e6bd35182dae..62d8c34e63d5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -168,6 +168,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; +extern void set_zone_contiguous(struct zone *zone); +extern void clear_zone_contiguous(struct zone *zone); + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* @@ -495,7 +498,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ -#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ enum ttu_flags; struct tlbflush_unmap_batch; @@ -538,4 +540,5 @@ static inline bool is_migrate_highatomic_page(struct page *page) } void setup_zone_pageset(struct zone *zone); +extern struct page *alloc_new_node_page(struct page *page, unsigned long node); #endif /* __MM_INTERNAL_H */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e42568284e06..d7b2a4bf8671 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -965,9 +965,7 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - /* Do not oom kill for khugepaged charges */ - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY, - &memcg, true))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } @@ -1326,9 +1324,7 @@ static void collapse_shmem(struct mm_struct *mm, goto out; } - /* Do not oom kill for khugepaged charges */ - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY, - &memcg, true))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out; } @@ -1348,8 +1344,8 @@ static void collapse_shmem(struct mm_struct *mm, */ index = start; - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + xa_lock_irq(&mapping->i_pages); + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { int n = min(iter.index, end) - index; /* @@ -1362,7 +1358,7 @@ static void collapse_shmem(struct mm_struct *mm, } nr_none += n; for (; index < min(iter.index, 
end); index++) { - radix_tree_insert(&mapping->page_tree, index, + radix_tree_insert(&mapping->i_pages, index, new_page + (index % HPAGE_PMD_NR)); } @@ -1371,16 +1367,16 @@ static void collapse_shmem(struct mm_struct *mm, break; page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* swap in or instantiate fallocated page */ if (shmem_getpage(mapping->host, index, &page, SGP_NOHUGE)) { result = SCAN_FAIL; goto tree_unlocked; } - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } else if (trylock_page(page)) { get_page(page); } else { @@ -1389,7 +1385,7 @@ static void collapse_shmem(struct mm_struct *mm, } /* - * The page must be locked, so we can drop the tree_lock + * The page must be locked, so we can drop the i_pages lock * without racing with truncate. */ VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -1400,7 +1396,7 @@ static void collapse_shmem(struct mm_struct *mm, result = SCAN_TRUNCATED; goto out_unlock; } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); if (isolate_lru_page(page)) { result = SCAN_DEL_PAGE_LRU; @@ -1410,11 +1406,11 @@ static void collapse_shmem(struct mm_struct *mm, if (page_mapped(page)) unmap_mapping_pages(mapping, index, 1, false); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); - slot = radix_tree_lookup_slot(&mapping->page_tree, index); + slot = radix_tree_lookup_slot(&mapping->i_pages, index); VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot, - &mapping->tree_lock), page); + &mapping->i_pages.xa_lock), page); VM_BUG_ON_PAGE(page_mapped(page), page); /* @@ -1435,14 +1431,14 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - radix_tree_replace_slot(&mapping->page_tree, slot, + radix_tree_replace_slot(&mapping->i_pages, slot, new_page + (index % HPAGE_PMD_NR)); slot = radix_tree_iter_resume(slot, &iter); index++; continue; out_lru: - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); putback_lru_page(page); out_isolate_failed: unlock_page(page); @@ -1468,14 +1464,14 @@ out_unlock: } for (; index < end; index++) { - radix_tree_insert(&mapping->page_tree, index, + radix_tree_insert(&mapping->i_pages, index, new_page + (index % HPAGE_PMD_NR)); } nr_none += n; } tree_locked: - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); tree_unlocked: if (result == SCAN_SUCCEED) { @@ -1524,9 +1520,8 @@ tree_unlocked: } else { /* Something went wrong: rollback changes to the radix-tree */ shmem_uncharge(mapping->host, nr_none); - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, - start) { + xa_lock_irq(&mapping->i_pages); + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (iter.index >= end) break; page = list_first_entry_or_null(&pagelist, @@ -1536,8 +1531,7 @@ tree_unlocked: break; nr_none--; /* Put holes back where they were */ - radix_tree_delete(&mapping->page_tree, - iter.index); + radix_tree_delete(&mapping->i_pages, iter.index); continue; } @@ -1546,16 +1540,15 @@ tree_unlocked: /* Unfreeze the page. 
*/ list_del(&page->lru); page_ref_unfreeze(page, 2); - radix_tree_replace_slot(&mapping->page_tree, - slot, page); + radix_tree_replace_slot(&mapping->i_pages, slot, page); slot = radix_tree_iter_resume(slot, &iter); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); putback_lru_page(page); unlock_page(page); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } VM_BUG_ON(nr_none); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* Unfreeze new_page, caller would take care about freeing it */ page_ref_unfreeze(new_page, 1); @@ -1583,7 +1576,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, swap = 0; memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (iter.index >= start + HPAGE_PMD_NR) break; @@ -1883,8 +1876,16 @@ static void set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; - for_each_populated_zone(zone) + for_each_populated_zone(zone) { + /* + * We don't need to worry about fragmentation of + * ZONE_MOVABLE since it only has movable pages. + */ + if (zone_idx(zone) > gfp_zone(GFP_USER)) + continue; + nr_zones++; + } /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ recommended_min = pageblock_nr_pages * nr_zones * 2; @@ -1131,6 +1131,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, } else { newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)); + /* + * We're replacing an anonymous page with a zero page, which is + * not anonymous. We need to do proper accounting otherwise we + * will get wrong values in /proc, and a BUG message in dmesg + * when tearing down the mm. 
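The accounting referred to here is MM_ANONPAGES (decremented just below), which is what /proc/<pid>/status reports as RssAnon, so skipping the decrement leaves a stale anonymous-RSS figure behind. As a loose illustration of that counter only, the following standalone program watches RssAnon move while plain anonymous memory is faulted in; it does not exercise KSM or the zero-page path, and the 16 MiB size is arbitrary.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static long rss_anon_kb(void)
{
	char line[256];
	long kb = -1;
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "RssAnon: %ld kB", &kb) == 1)
			break;
	fclose(f);
	return kb;
}

int main(void)
{
	size_t len = 16 * 1024 * 1024;

	printf("RssAnon before: %ld kB\n", rss_anon_kb());

	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;
	memset(p, 1, len);	/* fault in the anonymous pages */

	printf("RssAnon after:  %ld kB\n", rss_anon_kb());
	munmap(p, len);
	return 0;
}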
+ */ + dec_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(*ptep)); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9ec024b862ac..e074f7c637aa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1485,7 +1485,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_may_oom) + if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER) return; /* * We are in the middle of the charge context here, so we @@ -1839,7 +1839,7 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) } } - for (i = 0; i < MEMCG_NR_EVENTS; i++) { + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { long x; x = this_cpu_xchg(memcg->stat_cpu->events[i], 0); @@ -1858,7 +1858,7 @@ static void reclaim_high(struct mem_cgroup *memcg, do { if (page_counter_read(&memcg->memory) <= memcg->high) continue; - mem_cgroup_event(memcg, MEMCG_HIGH); + memcg_memory_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); } while ((memcg = parent_mem_cgroup(memcg))); } @@ -1949,7 +1949,7 @@ retry: if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; - mem_cgroup_event(mem_over_limit, MEMCG_MAX); + memcg_memory_event(mem_over_limit, MEMCG_MAX); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap); @@ -1992,7 +1992,7 @@ retry: if (fatal_signal_pending(current)) goto force; - mem_cgroup_event(mem_over_limit, MEMCG_OOM); + memcg_memory_event(mem_over_limit, MEMCG_OOM); mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); @@ -2688,10 +2688,10 @@ static void tree_events(struct mem_cgroup *memcg, unsigned long *events) struct mem_cgroup *iter; int i; - memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS); + memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS); for_each_mem_cgroup_tree(iter, memcg) { - for (i = 0; i < MEMCG_NR_EVENTS; i++) + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) events[i] += memcg_sum_events(iter, i); } } @@ -4108,6 +4108,9 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + if (!pn) + return; + free_percpu(pn->lruvec_stat_cpu); kfree(pn); } @@ -5178,7 +5181,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, continue; } - mem_cgroup_event(memcg, MEMCG_OOM); + memcg_memory_event(memcg, MEMCG_OOM); if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) break; } @@ -5191,10 +5194,14 @@ static int memory_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW)); - seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH)); - seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX)); - seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM)); + seq_printf(m, "low %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_LOW])); + seq_printf(m, "high %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_HIGH])); + seq_printf(m, "max %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_MAX])); + seq_printf(m, "oom %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_OOM])); seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); return 0; @@ -5204,7 +5211,7 @@ static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); unsigned long stat[MEMCG_NR_STAT]; - unsigned long events[MEMCG_NR_EVENTS]; + unsigned long 
events[NR_VM_EVENT_ITEMS]; int i; /* @@ -5967,9 +5974,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) /* * Interrupts should be disabled here because the caller holds the - * mapping->tree_lock lock which is taken with interrupts-off. It is + * i_pages lock which is taken with interrupts-off. It is * important here to have the interrupts disabled because it is the - * only synchronisation we have for udpating the per-CPU variables. + * only synchronisation we have for updating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2d4bf647cf01..9d142b9b86dc 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1487,7 +1487,7 @@ int unpoison_memory(unsigned long pfn) } EXPORT_SYMBOL(unpoison_memory); -static struct page *new_page(struct page *p, unsigned long private, int **x) +static struct page *new_page(struct page *p, unsigned long private) { int nid = page_to_nid(p); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cc6dfa5832ca..f74826cdceea 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1329,8 +1329,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) return 0; } -static struct page *new_node_page(struct page *page, unsigned long private, - int **result) +static struct page *new_node_page(struct page *page, unsigned long private) { int nid = page_to_nid(page); nodemask_t nmask = node_states[N_MEMORY]; @@ -1373,7 +1372,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (isolate_huge_page(page, &source)) move_pages -= 1 << compound_order(head); continue; - } else if (thp_migration_supported() && PageTransHuge(page)) + } else if (PageTransHuge(page)) pfn = page_to_pfn(compound_head(page)) + hpage_nr_pages(page) - 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 01cbb7078d6c..9ac49ef17b4e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -446,15 +446,6 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, __split_huge_pmd(walk->vma, pmd, addr, false, NULL); goto out; } - if (!thp_migration_supported()) { - get_page(page); - spin_unlock(ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - goto out; - } if (!queue_pages_required(page, qp)) { ret = 1; goto unlock; @@ -495,7 +486,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; -retry: + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) @@ -511,22 +502,6 @@ retry: continue; if (!queue_pages_required(page, qp)) continue; - if (PageTransCompound(page) && !thp_migration_supported()) { - get_page(page); - pte_unmap_unlock(pte, ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - /* Failed to split -- skip. 
*/ - if (ret) { - pte = pte_offset_map_lock(walk->mm, pmd, - addr, &ptl); - continue; - } - goto retry; - } - migrate_page_add(page, qp->pagelist, flags); } pte_unmap_unlock(pte - 1, ptl); @@ -942,12 +917,13 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, } } -static struct page *new_node_page(struct page *page, unsigned long node, int **x) +/* page allocation callback for NUMA node migration */ +struct page *alloc_new_node_page(struct page *page, unsigned long node) { if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), node); - else if (thp_migration_supported() && PageTransHuge(page)) { + else if (PageTransHuge(page)) { struct page *thp; thp = alloc_pages_node(node, @@ -986,7 +962,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, NULL, dest, + err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1107,7 +1083,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, * list of pages handed to migrate_pages()--which is how we get here-- * is in virtual address order. */ -static struct page *new_page(struct page *page, unsigned long start, int **x) +static struct page *new_page(struct page *page, unsigned long start) { struct vm_area_struct *vma; unsigned long uninitialized_var(address); @@ -1123,7 +1099,7 @@ static struct page *new_page(struct page *page, unsigned long start, int **x) if (PageHuge(page)) { return alloc_huge_page_vma(page_hstate(compound_head(page)), vma, address); - } else if (thp_migration_supported() && PageTransHuge(page)) { + } else if (PageTransHuge(page)) { struct page *thp; thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, @@ -1152,7 +1128,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct page *new_page(struct page *page, unsigned long start, int **x) +static struct page *new_page(struct page *page, unsigned long start) { return NULL; } diff --git a/mm/migrate.c b/mm/migrate.c index 003886606a22..f65dd69e1fd1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -467,20 +467,21 @@ int migrate_page_move_mapping(struct address_space *mapping, oldzone = page_zone(page); newzone = page_zone(newpage); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); - pslot = radix_tree_lookup_slot(&mapping->page_tree, + pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); expected_count += 1 + page_has_private(page); if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { - spin_unlock_irq(&mapping->tree_lock); + radix_tree_deref_slot_protected(pslot, + &mapping->i_pages.xa_lock) != page) { + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } @@ -494,7 +495,7 @@ int migrate_page_move_mapping(struct address_space *mapping, if (mode == MIGRATE_ASYNC && head && !buffer_migrate_lock_buffers(head, mode)) { page_ref_unfreeze(page, expected_count); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } @@ -522,7 +523,7 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); 
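The conversion repeated throughout these hunks is mechanical: the spinlock that used to sit next to the radix tree (mapping->tree_lock) now lives inside the tree root as mapping->i_pages.xa_lock, xa_lock()/xa_unlock() and their _irq/_irqsave variants take it, and callers that still need the raw spinlock_t, such as radix_tree_deref_slot_protected(), pass &mapping->i_pages.xa_lock directly. A compilable userspace sketch of that structural move is below; the struct and macro names mirror the kernel ones, but the pthread mutex is only a stand-in for the kernel spinlock and the "tree" is a single pointer.

#include <pthread.h>
#include <stdio.h>

struct xarray {
	pthread_mutex_t xa_lock;	/* was: a separate mapping->tree_lock */
	void *head;			/* tree contents, irrelevant here */
};

struct address_space {
	struct xarray i_pages;		/* was: page_tree plus tree_lock */
};

#define xa_lock(xa)	pthread_mutex_lock(&(xa)->xa_lock)
#define xa_unlock(xa)	pthread_mutex_unlock(&(xa)->xa_lock)

static void replace_entry(struct address_space *mapping, void *new)
{
	xa_lock(&mapping->i_pages);	/* was: spin_lock(&mapping->tree_lock) */
	mapping->i_pages.head = new;	/* stands in for radix_tree_replace_slot() */
	xa_unlock(&mapping->i_pages);	/* was: spin_unlock(&mapping->tree_lock) */
}

int main(void)
{
	static struct address_space mapping = {
		.i_pages = { .xa_lock = PTHREAD_MUTEX_INITIALIZER },
	};

	replace_entry(&mapping, "newpage");
	printf("entry is now: %s\n", (const char *)mapping.i_pages.head);
	return 0;
}

Embedding the lock in the root is what later allows the XArray API to take and drop it internally, without every caller having to know about a separate lock field.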
+ radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); /* * Drop cache reference from old page by unfreezing @@ -531,7 +532,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ page_ref_unfreeze(page, expected_count - 1); - spin_unlock(&mapping->tree_lock); + xa_unlock(&mapping->i_pages); /* Leave irq disabled to prevent preemption while updating stats */ /* @@ -574,20 +575,19 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, int expected_count; void **pslot; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); - pslot = radix_tree_lookup_slot(&mapping->page_tree, - page_index(page)); + pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { - spin_unlock_irq(&mapping->tree_lock); + radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } @@ -596,11 +596,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); + radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); page_ref_unfreeze(page, expected_count - 1); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return MIGRATEPAGE_SUCCESS; } @@ -1137,10 +1137,12 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, enum migrate_reason reason) { int rc = MIGRATEPAGE_SUCCESS; - int *result = NULL; struct page *newpage; - newpage = get_new_page(page, private, &result); + if (!thp_migration_supported() && PageTransHuge(page)) + return -ENOMEM; + + newpage = get_new_page(page, private); if (!newpage) return -ENOMEM; @@ -1161,14 +1163,6 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; } - if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) { - lock_page(page); - rc = split_huge_page(page); - unlock_page(page); - if (rc) - goto out; - } - rc = __unmap_and_move(page, newpage, force, mode); if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(newpage, reason); @@ -1231,12 +1225,6 @@ put_new: put_page(newpage); } - if (result) { - if (rc) - *result = rc; - else - *result = page_to_nid(newpage); - } return rc; } @@ -1264,7 +1252,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, enum migrate_mode mode, int reason) { int rc = -EAGAIN; - int *result = NULL; int page_was_mapped = 0; struct page *new_hpage; struct anon_vma *anon_vma = NULL; @@ -1281,7 +1268,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOSYS; } - new_hpage = get_new_page(hpage, private, &result); + new_hpage = get_new_page(hpage, private); if (!new_hpage) return -ENOMEM; @@ -1345,12 +1332,6 @@ out: else putback_active_hugepage(new_hpage); - if (result) { - if (rc) - *result = rc; - else - *result = page_to_nid(new_hpage); - } return rc; } @@ -1395,6 +1376,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, retry = 0; list_for_each_entry_safe(page, page2, from, lru) { +retry: cond_resched(); if (PageHuge(page)) @@ -1408,6 +1390,26 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, switch(rc) { case -ENOMEM: + /* + * THP migration might be unsupported or the + * allocation could've 
failed so we should + * retry on the same page with the THP split + * to base pages. + * + * Head page is retried immediately and tail + * pages are added to the tail of the list so + * we encounter them after the rest of the list + * is processed. + */ + if (PageTransHuge(page)) { + lock_page(page); + rc = split_huge_page_to_list(page, from); + unlock_page(page); + if (!rc) { + list_safe_reset_next(page, page2, lru); + goto retry; + } + } nr_failed++; goto out; case -EAGAIN: @@ -1444,141 +1446,101 @@ out: } #ifdef CONFIG_NUMA -/* - * Move a list of individual pages - */ -struct page_to_node { - unsigned long addr; - struct page *page; - int node; - int status; -}; -static struct page *new_page_node(struct page *p, unsigned long private, - int **result) +static int store_status(int __user *status, int start, int value, int nr) { - struct page_to_node *pm = (struct page_to_node *)private; - - while (pm->node != MAX_NUMNODES && pm->page != p) - pm++; + while (nr-- > 0) { + if (put_user(value, status + start)) + return -EFAULT; + start++; + } - if (pm->node == MAX_NUMNODES) - return NULL; + return 0; +} - *result = &pm->status; +static int do_move_pages_to_node(struct mm_struct *mm, + struct list_head *pagelist, int node) +{ + int err; - if (PageHuge(p)) - return alloc_huge_page_node(page_hstate(compound_head(p)), - pm->node); - else if (thp_migration_supported() && PageTransHuge(p)) { - struct page *thp; + if (list_empty(pagelist)) + return 0; - thp = alloc_pages_node(pm->node, - (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, - HPAGE_PMD_ORDER); - if (!thp) - return NULL; - prep_transhuge_page(thp); - return thp; - } else - return __alloc_pages_node(pm->node, - GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); + err = migrate_pages(pagelist, alloc_new_node_page, NULL, node, + MIGRATE_SYNC, MR_SYSCALL); + if (err) + putback_movable_pages(pagelist); + return err; } /* - * Move a set of pages as indicated in the pm array. The addr - * field must be set to the virtual address of the page to be moved - * and the node number must contain a valid target node. - * The pm array ends with node = MAX_NUMNODES. + * Resolves the given address to a struct page, isolates it from the LRU and + * puts it to the given pagelist. 
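The -ENOMEM handling added to migrate_pages() above inverts the old order of operations: a transparent huge page is migrated as one unit first, and only when the target allocation fails is it split, with the head page retried immediately and the tail pages queued at the end of the list so they are visited after everything else. A userspace model of that control flow over a singly linked list follows; struct item, try_migrate() and split() are stand-ins invented for the example, and try_migrate() always fails for huge entries so the split path runs.

#include <stdio.h>
#include <stdlib.h>

struct item {
	int huge;
	int id;
	struct item *next;
};

static int try_migrate(struct item *it)
{
	return it->huge ? -1 : 0;	/* -1 models -ENOMEM for a THP */
}

static void split(struct item *it, struct item **tail, int nr_tail)
{
	it->huge = 0;			/* the head becomes a base page in place */
	for (int i = 0; i < nr_tail; i++) {
		struct item *t = calloc(1, sizeof(*t));

		if (!t)
			exit(1);
		t->id = it->id * 100 + i + 1;
		(*tail)->next = t;	/* tail pages go to the end of the list */
		*tail = t;
	}
}

int main(void)
{
	struct item c = { .id = 3 };
	struct item b = { .huge = 1, .id = 2, .next = &c };
	struct item a = { .id = 1, .next = &b };
	struct item *tail = &c;

	for (struct item *it = &a; it; ) {
		if (try_migrate(it)) {
			split(it, &tail, 3);
			continue;	/* retry the same, now base, page */
		}
		printf("migrated %d\n", it->id);
		it = it->next;
	}
	return 0;
}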
+ * Returns -errno if the page cannot be found/isolated or 0 when it has been + * queued or the page doesn't need to be migrated because it is already on + * the target node */ -static int do_move_page_to_node_array(struct mm_struct *mm, - struct page_to_node *pm, - int migrate_all) +static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, + int node, struct list_head *pagelist, bool migrate_all) { + struct vm_area_struct *vma; + struct page *page; + unsigned int follflags; int err; - struct page_to_node *pp; - LIST_HEAD(pagelist); down_read(&mm->mmap_sem); + err = -EFAULT; + vma = find_vma(mm, addr); + if (!vma || addr < vma->vm_start || !vma_migratable(vma)) + goto out; - /* - * Build a list of pages to migrate - */ - for (pp = pm; pp->node != MAX_NUMNODES; pp++) { - struct vm_area_struct *vma; - struct page *page; - struct page *head; - unsigned int follflags; - - err = -EFAULT; - vma = find_vma(mm, pp->addr); - if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) - goto set_status; - - /* FOLL_DUMP to ignore special (like zero) pages */ - follflags = FOLL_GET | FOLL_DUMP; - if (!thp_migration_supported()) - follflags |= FOLL_SPLIT; - page = follow_page(vma, pp->addr, follflags); + /* FOLL_DUMP to ignore special (like zero) pages */ + follflags = FOLL_GET | FOLL_DUMP; + page = follow_page(vma, addr, follflags); - err = PTR_ERR(page); - if (IS_ERR(page)) - goto set_status; + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; - err = -ENOENT; - if (!page) - goto set_status; + err = -ENOENT; + if (!page) + goto out; - err = page_to_nid(page); + err = 0; + if (page_to_nid(page) == node) + goto out_putpage; - if (err == pp->node) - /* - * Node already in the right place - */ - goto put_and_set; + err = -EACCES; + if (page_mapcount(page) > 1 && !migrate_all) + goto out_putpage; - err = -EACCES; - if (page_mapcount(page) > 1 && - !migrate_all) - goto put_and_set; - - if (PageHuge(page)) { - if (PageHead(page)) { - isolate_huge_page(page, &pagelist); - err = 0; - pp->page = page; - } - goto put_and_set; + if (PageHuge(page)) { + if (PageHead(page)) { + isolate_huge_page(page, pagelist); + err = 0; } + } else { + struct page *head; - pp->page = compound_head(page); head = compound_head(page); err = isolate_lru_page(head); - if (!err) { - list_add_tail(&head->lru, &pagelist); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_cache(head), - hpage_nr_pages(head)); - } -put_and_set: - /* - * Either remove the duplicate refcount from - * isolate_lru_page() or drop the page ref if it was - * not isolated. - */ - put_page(page); -set_status: - pp->status = err; - } - - err = 0; - if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_page_node, NULL, - (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) - putback_movable_pages(&pagelist); - } + goto out_putpage; + err = 0; + list_add_tail(&head->lru, pagelist); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + page_is_file_cache(head), + hpage_nr_pages(head)); + } +out_putpage: + /* + * Either remove the duplicate refcount from + * isolate_lru_page() or drop the page ref if it was + * not isolated. 
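add_page_for_migration() and do_move_pages_to_node() above, together with the do_pages_move() rewrite in the following hunk, sit behind the move_pages(2) syscall, which now batches consecutive requests for the same target node rather than filling a fixed page_to_node array. A minimal caller is sketched below (link with -lnuma; node 0 is only an example target and must exist on the machine):

#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	void *buf;

	if (posix_memalign(&buf, page_size, page_size))
		return 1;
	memset(buf, 0, page_size);		/* fault the page in first */

	void *pages[1] = { buf };
	int nodes[1] = { 0 };			/* requested NUMA node */
	int status[1] = { -1 };

	/* pid 0 means the calling process; MPOL_MF_MOVE only moves pages
	 * mapped exclusively by this process. */
	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) == -1) {
		perror("move_pages");
		return 1;
	}

	/* status[0] is the node the page ended up on, or a negative errno
	 * (for example -ENOENT or -EACCES) if this page was not moved. */
	printf("page %p -> status %d\n", buf, status[0]);
	free(buf);
	return 0;
}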
+ */ + put_page(page); +out: up_read(&mm->mmap_sem); return err; } @@ -1593,79 +1555,79 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, const int __user *nodes, int __user *status, int flags) { - struct page_to_node *pm; - unsigned long chunk_nr_pages; - unsigned long chunk_start; - int err; - - err = -ENOMEM; - pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); - if (!pm) - goto out; + int current_node = NUMA_NO_NODE; + LIST_HEAD(pagelist); + int start, i; + int err = 0, err1; migrate_prep(); - /* - * Store a chunk of page_to_node array in a page, - * but keep the last one as a marker - */ - chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1; - - for (chunk_start = 0; - chunk_start < nr_pages; - chunk_start += chunk_nr_pages) { - int j; + for (i = start = 0; i < nr_pages; i++) { + const void __user *p; + unsigned long addr; + int node; - if (chunk_start + chunk_nr_pages > nr_pages) - chunk_nr_pages = nr_pages - chunk_start; - - /* fill the chunk pm with addrs and nodes from user-space */ - for (j = 0; j < chunk_nr_pages; j++) { - const void __user *p; - int node; - - err = -EFAULT; - if (get_user(p, pages + j + chunk_start)) - goto out_pm; - pm[j].addr = (unsigned long) p; - - if (get_user(node, nodes + j + chunk_start)) - goto out_pm; - - err = -ENODEV; - if (node < 0 || node >= MAX_NUMNODES) - goto out_pm; - - if (!node_state(node, N_MEMORY)) - goto out_pm; - - err = -EACCES; - if (!node_isset(node, task_nodes)) - goto out_pm; + err = -EFAULT; + if (get_user(p, pages + i)) + goto out_flush; + if (get_user(node, nodes + i)) + goto out_flush; + addr = (unsigned long)p; + + err = -ENODEV; + if (node < 0 || node >= MAX_NUMNODES) + goto out_flush; + if (!node_state(node, N_MEMORY)) + goto out_flush; - pm[j].node = node; + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out_flush; + + if (current_node == NUMA_NO_NODE) { + current_node = node; + start = i; + } else if (node != current_node) { + err = do_move_pages_to_node(mm, &pagelist, current_node); + if (err) + goto out; + err = store_status(status, start, current_node, i - start); + if (err) + goto out; + start = i; + current_node = node; } - /* End marker for this chunk */ - pm[chunk_nr_pages].node = MAX_NUMNODES; - - /* Migrate this chunk */ - err = do_move_page_to_node_array(mm, pm, - flags & MPOL_MF_MOVE_ALL); - if (err < 0) - goto out_pm; + /* + * Errors in the page lookup or isolation are not fatal and we simply + * report them via status + */ + err = add_page_for_migration(mm, addr, current_node, + &pagelist, flags & MPOL_MF_MOVE_ALL); + if (!err) + continue; - /* Return status information */ - for (j = 0; j < chunk_nr_pages; j++) - if (put_user(pm[j].status, status + j + chunk_start)) { - err = -EFAULT; - goto out_pm; - } - } - err = 0; + err = store_status(status, i, err, 1); + if (err) + goto out_flush; -out_pm: - free_page((unsigned long)pm); + err = do_move_pages_to_node(mm, &pagelist, current_node); + if (err) + goto out; + if (i > start) { + err = store_status(status, start, current_node, i - start); + if (err) + goto out; + } + current_node = NUMA_NO_NODE; + } +out_flush: + /* Make sure we do not overwrite the existing error */ + err1 = do_move_pages_to_node(mm, &pagelist, current_node); + if (!err1) + err1 = store_status(status, start, current_node, i - start); + if (!err) + err = err1; out: return err; } @@ -1866,8 +1828,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, } static struct page *alloc_misplaced_dst_page(struct page *page, - unsigned long 
data, - int **result) + unsigned long data) { int nid = (int) data; struct page *newpage; @@ -1987,6 +1948,13 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, goto out; /* + * Also do not migrate dirty pages as not all filesystems can move + * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. + */ + if (page_is_file_cache(page) && PageDirty(page)) + goto out; + + /* * Rate-limit the amount of data that is being migrated to a node. * Optimal placement is no good if the memory bus is saturated and * all the time is being spent migrating! @@ -2339,7 +2307,8 @@ again: ptep_get_and_clear(mm, addr, ptep); /* Setup special migration page table entry */ - entry = make_migration_entry(page, pte_write(pte)); + entry = make_migration_entry(page, mpfn & + MIGRATE_PFN_WRITE); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pte)) swp_pte = pte_swp_mksoft_dirty(swp_pte); diff --git a/mm/mmap.c b/mm/mmap.c index f2154fc2548b..188f195883b9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1342,6 +1342,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; + /* force arch specific MAP_FIXED handling in get_unmapped_area */ + if (flags & MAP_FIXED_NOREPLACE) + flags |= MAP_FIXED; + if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); @@ -1365,6 +1369,13 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (offset_in_page(addr)) return addr; + if (flags & MAP_FIXED_NOREPLACE) { + struct vm_area_struct *vma = find_vma(mm, addr); + + if (vma && vma->vm_start <= addr) + return -EEXIST; + } + if (prot == PROT_EXEC) { pkey = execute_only_pkey(mm); if (pkey < 0) diff --git a/mm/mprotect.c b/mm/mprotect.c index c1d6af7455da..625608bc8962 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -27,6 +27,7 @@ #include <linux/pkeys.h> #include <linux/ksm.h> #include <linux/uaccess.h> +#include <linux/mm_inline.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> @@ -89,6 +90,14 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page_mapcount(page) != 1) continue; + /* + * While migration can move some dirty pages, + * it cannot move them all from MIGRATE_ASYNC + * context. + */ + if (page_is_file_cache(page) && PageDirty(page)) + continue; + /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) continue; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 586f31261c83..5c1a3279e63f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2099,7 +2099,8 @@ void __init page_writeback_init(void) * so that it can tag pages faster than a dirtying process can create them). */ /* - * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. + * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock + * latency. 
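The do_mmap() hunks above give MAP_FIXED_NOREPLACE the address-selection behaviour of MAP_FIXED while failing with EEXIST instead of silently unmapping whatever already occupies the range. A small userspace demonstration follows; the #ifndef fallback supplies the flag value used on most architectures for builds against older uapi headers, the hint address is arbitrary, and on kernels that predate the flag the unknown bit is ignored so the call quietly degrades to an ordinary hint.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MAP_FIXED_NOREPLACE
#define MAP_FIXED_NOREPLACE 0x100000
#endif

int main(void)
{
	void *hint = (void *)0x40000000UL;	/* arbitrary example address */
	size_t len = 4096;

	void *p = mmap(hint, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0);
	if (p == MAP_FAILED) {
		fprintf(stderr, "first mmap: %s\n", strerror(errno));
		return 1;
	}
	printf("mapped exactly at %p\n", p);

	/* A second attempt at the same address must now fail with EEXIST. */
	void *q = mmap(hint, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0);
	printf("second attempt: %s\n",
	       q == MAP_FAILED ? strerror(errno) : "unexpectedly succeeded");
	if (q != MAP_FAILED)
		munmap(q, len);
	munmap(p, len);
	return 0;
}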
*/ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -2109,22 +2110,22 @@ void tag_pages_for_writeback(struct address_space *mapping, struct radix_tree_iter iter; void **slot; - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, + xa_lock_irq(&mapping->i_pages); + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, PAGECACHE_TAG_DIRTY) { if (iter.index > end) break; - radix_tree_iter_tag_set(&mapping->page_tree, &iter, + radix_tree_iter_tag_set(&mapping->i_pages, &iter, PAGECACHE_TAG_TOWRITE); tagged++; if ((tagged % WRITEBACK_TAG_BATCH) != 0) continue; slot = radix_tree_iter_resume(slot, &iter); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); cond_resched(); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } EXPORT_SYMBOL(tag_pages_for_writeback); @@ -2467,13 +2468,13 @@ int __set_page_dirty_nobuffers(struct page *page) return 1; } - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, page_index(page), + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); if (mapping->host) { @@ -2718,11 +2719,10 @@ int test_clear_page_writeback(struct page *page) struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); ret = TestClearPageWriteback(page); if (ret) { - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); @@ -2736,7 +2736,7 @@ int test_clear_page_writeback(struct page *page) PAGECACHE_TAG_WRITEBACK)) sb_clear_inode_writeback(mapping->host); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = TestClearPageWriteback(page); } @@ -2766,7 +2766,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); ret = TestSetPageWriteback(page); if (!ret) { bool on_wblist; @@ -2774,8 +2774,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); @@ -2789,14 +2788,12 @@ int __test_set_page_writeback(struct page *page, bool keep_write) sb_mark_inode_writeback(mapping->host); } if (!PageDirty(page)) - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); if (!keep_write) - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + radix_tree_tag_clear(&mapping->i_pages, page_index(page), 
PAGECACHE_TAG_TOWRITE); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = TestSetPageWriteback(page); } @@ -2816,7 +2813,7 @@ EXPORT_SYMBOL(__test_set_page_writeback); */ int mapping_tagged(struct address_space *mapping, int tag) { - return radix_tree_tagged(&mapping->page_tree, tag); + return radix_tree_tagged(&mapping->i_pages, tag); } EXPORT_SYMBOL(mapping_tagged); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0b97b8ece4a9..905db9d7962f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -46,7 +46,6 @@ #include <linux/stop_machine.h> #include <linux/sort.h> #include <linux/pfn.h> -#include <xen/xen.h> #include <linux/backing-dev.h> #include <linux/fault-inject.h> #include <linux/page-isolation.h> @@ -205,17 +204,18 @@ static void __free_pages_ok(struct page *page, unsigned int order); * TBD: should special case ZONE_DMA32 machines here - in those we normally * don't need any ZONE_NORMAL reservation */ -int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { +int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { #ifdef CONFIG_ZONE_DMA - 256, + [ZONE_DMA] = 256, #endif #ifdef CONFIG_ZONE_DMA32 - 256, + [ZONE_DMA32] = 256, #endif + [ZONE_NORMAL] = 32, #ifdef CONFIG_HIGHMEM - 32, + [ZONE_HIGHMEM] = 0, #endif - 32, + [ZONE_MOVABLE] = 0, }; EXPORT_SYMBOL(totalram_pages); @@ -316,9 +316,6 @@ static inline bool update_defer_init(pg_data_t *pgdat, /* Always populate low zones for address-constrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; - /* Xen PV domains need page structures early */ - if (xen_pv_domain()) - return true; (*nr_initialised)++; if ((*nr_initialised > pgdat->static_init_pgcnt) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { @@ -1746,16 +1743,38 @@ void __init page_alloc_init_late(void) } #ifdef CONFIG_CMA +static void __init adjust_present_page_count(struct page *page, long count) +{ + struct zone *zone = page_zone(page); + + /* We don't need to hold a lock since it is boot-up process */ + zone->present_pages += count; +} + /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void __init init_cma_reserved_pageblock(struct page *page) { unsigned i = pageblock_nr_pages; + unsigned long pfn = page_to_pfn(page); struct page *p = page; + int nid = page_to_nid(page); + + /* + * ZONE_MOVABLE will steal present pages from other zones by + * changing page links so page_zone() is changed. Before that, + * we need to adjust previous zone's page count first. + */ + adjust_present_page_count(page, -pageblock_nr_pages); do { __ClearPageReserved(p); set_page_count(p, 0); - } while (++p, --i); + + /* Steal pages from other zones */ + set_page_links(p, ZONE_MOVABLE, nid, pfn); + } while (++p, ++pfn, --i); + + adjust_present_page_count(page, pageblock_nr_pages); set_pageblock_migratetype(page, MIGRATE_CMA); @@ -2870,7 +2889,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * exists. */ watermark = min_wmark_pages(zone) + (1UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) return 0; __mod_zone_freepage_state(zone, -(1UL << order), mt); @@ -3146,12 +3165,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); -#endif - /* * Check watermarks for an order-0 allocation request. 
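The sysctl_lowmem_reserve_ratio change near the top of the mm/page_alloc.c diff switches the array to designated initializers, so each ratio is tied to its zone index by name, optional zones no longer shift the later entries, and ZONE_HIGHMEM and ZONE_MOVABLE can carry an explicit 0 that setup_per_zone_lowmem_reserve() (in a later hunk) now treats as "no reserve at all". The same idiom in a standalone program, with an illustrative zone enum rather than the kernel's config-dependent one:

#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_HIGHMEM,
		 ZONE_MOVABLE, MAX_NR_ZONES };

/* Designated initializers: entries not listed default to 0. */
static const int lowmem_reserve_ratio[MAX_NR_ZONES] = {
	[ZONE_DMA]	= 256,
	[ZONE_DMA32]	= 256,
	[ZONE_NORMAL]	= 32,
	[ZONE_HIGHMEM]	= 0,
	[ZONE_MOVABLE]	= 0,
};

int main(void)
{
	static const char *names[MAX_NR_ZONES] = {
		"DMA", "DMA32", "Normal", "HighMem", "Movable"
	};

	for (int i = 0; i < MAX_NR_ZONES; i++)
		printf("%-8s ratio %3d%s\n", names[i], lowmem_reserve_ratio[i],
		       lowmem_reserve_ratio[i] ? "" : "  (no lowmem reserve)");
	return 0;
}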
If these * are not met, then a high-order request also cannot go ahead @@ -3178,10 +3191,8 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } #ifdef CONFIG_CMA - if ((alloc_flags & ALLOC_CMA) && - !list_empty(&area->free_list[MIGRATE_CMA])) { + if (!list_empty(&area->free_list[MIGRATE_CMA])) return true; - } #endif if (alloc_harder && !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) @@ -3201,13 +3212,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, unsigned int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); - long cma_pages = 0; - -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); -#endif /* * Fast check for order-0 only. If this fails then the reserves @@ -3216,7 +3220,7 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * the caller is !atomic then it'll uselessly search the free * list. That corner case is then slower but it is harmless. */ - if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + if (!order && free_pages > mark + z->lowmem_reserve[classzone_idx]) return true; return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, @@ -3852,10 +3856,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; -#ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif return alloc_flags; } @@ -4322,9 +4322,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return false; - if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) - *alloc_flags |= ALLOC_CMA; - return true; } @@ -4734,6 +4731,13 @@ long si_mem_available(void) min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + /* + * Part of the kernel memory, which can be released under memory + * pressure. + */ + available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >> + PAGE_SHIFT; + if (available < 0) available = 0; return available; @@ -6200,6 +6204,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; + unsigned long node_end_pfn = 0; pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING @@ -6227,9 +6232,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long movable_size = 0; size = zone->spanned_pages; realsize = freesize = zone->present_pages; + if (zone_end_pfn(zone) > node_end_pfn) + node_end_pfn = zone_end_pfn(zone); + /* * Adjust freesize so that it accounts for how much memory @@ -6278,12 +6287,30 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone_seqlock_init(zone); zone_pcp_init(zone); - if (!size) + /* + * The size of the CMA area is unknown now so we need to + * prepare the memory for the usemap at maximum. 
+ */ + if (IS_ENABLED(CONFIG_CMA) && j == ZONE_MOVABLE && + pgdat->node_spanned_pages) { + movable_size = node_end_pfn - pgdat->node_start_pfn; + } + + if (!size && !movable_size) continue; set_pageblock_order(); - setup_usemap(pgdat, zone, zone_start_pfn, size); - init_currently_empty_zone(zone, zone_start_pfn, size); + if (movable_size) { + zone->zone_start_pfn = pgdat->node_start_pfn; + zone->spanned_pages = movable_size; + setup_usemap(pgdat, zone, + pgdat->node_start_pfn, movable_size); + init_currently_empty_zone(zone, + pgdat->node_start_pfn, movable_size); + } else { + setup_usemap(pgdat, zone, zone_start_pfn, size); + init_currently_empty_zone(zone, zone_start_pfn, size); + } memmap_init(size, nid, j, zone_start_pfn); } } @@ -7125,13 +7152,15 @@ static void setup_per_zone_lowmem_reserve(void) struct zone *lower_zone; idx--; - - if (sysctl_lowmem_reserve_ratio[idx] < 1) - sysctl_lowmem_reserve_ratio[idx] = 1; - lower_zone = pgdat->node_zones + idx; - lower_zone->lowmem_reserve[j] = managed_pages / - sysctl_lowmem_reserve_ratio[idx]; + + if (sysctl_lowmem_reserve_ratio[idx] < 1) { + sysctl_lowmem_reserve_ratio[idx] = 0; + lower_zone->lowmem_reserve[j] = 0; + } else { + lower_zone->lowmem_reserve[j] = + managed_pages / sysctl_lowmem_reserve_ratio[idx]; + } managed_pages += lower_zone->managed_pages; } } @@ -7922,7 +7951,7 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) } #endif -#ifdef CONFIG_MEMORY_HOTPLUG +#if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA /* * The zone indicated has a new number of managed_pages; batch sizes and percpu * page high values need to be recalulated. diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 61dee77bb211..43e085608846 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -309,8 +309,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, return pfn < end_pfn ? 
-EBUSY : 0; } -struct page *alloc_migrate_target(struct page *page, unsigned long private, - int **resultp) +struct page *alloc_migrate_target(struct page *page, unsigned long private) { return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]); } diff --git a/mm/readahead.c b/mm/readahead.c index 4d57b4644f98..539bbb6c1fad 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -175,7 +175,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, break; rcu_read_lock(); - page = radix_tree_lookup(&mapping->page_tree, page_offset); + page = radix_tree_lookup(&mapping->i_pages, page_offset); rcu_read_unlock(); if (page && !radix_tree_exceptional_entry(page)) continue; diff --git a/mm/rmap.c b/mm/rmap.c index 9122787c4947..f0dd4e4565bc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -32,11 +32,11 @@ * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) - * mapping->tree_lock (widely used) + * i_pages lock (widely used) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) - * mapping->tree_lock (widely used, in set_page_dirty, + * i_pages lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within bdi.wb->list_lock in __sync_single_inode) * diff --git a/mm/shmem.c b/mm/shmem.c index 4424fc0c33aa..9d6c7e595415 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -332,12 +332,12 @@ static int shmem_radix_tree_replace(struct address_space *mapping, VM_BUG_ON(!expected); VM_BUG_ON(!replacement); - item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot); + item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot); if (!item) return -ENOENT; if (item != expected) return -ENOENT; - __radix_tree_replace(&mapping->page_tree, node, pslot, + __radix_tree_replace(&mapping->i_pages, node, pslot, replacement, NULL); return 0; } @@ -355,7 +355,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, void *item; rcu_read_lock(); - item = radix_tree_lookup(&mapping->page_tree, index); + item = radix_tree_lookup(&mapping->i_pages, index); rcu_read_unlock(); return item == swp_to_radix_entry(swap); } @@ -590,14 +590,14 @@ static int shmem_add_to_page_cache(struct page *page, page->mapping = mapping; page->index = index; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); if (PageTransHuge(page)) { void __rcu **results; pgoff_t idx; int i; error = 0; - if (radix_tree_gang_lookup_slot(&mapping->page_tree, + if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, index, 1) && idx < index + HPAGE_PMD_NR) { error = -EEXIST; @@ -605,14 +605,14 @@ static int shmem_add_to_page_cache(struct page *page, if (!error) { for (i = 0; i < HPAGE_PMD_NR; i++) { - error = radix_tree_insert(&mapping->page_tree, + error = radix_tree_insert(&mapping->i_pages, index + i, page + i); VM_BUG_ON(error); } count_vm_event(THP_FILE_ALLOC); } } else if (!expected) { - error = radix_tree_insert(&mapping->page_tree, index, page); + error = radix_tree_insert(&mapping->i_pages, index, page); } else { error = shmem_radix_tree_replace(mapping, index, expected, page); @@ -624,10 +624,10 @@ static int shmem_add_to_page_cache(struct page *page, __inc_node_page_state(page, NR_SHMEM_THPS); __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); - 
spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } else { page->mapping = NULL; - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); page_ref_sub(page, nr); } return error; @@ -643,13 +643,13 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) VM_BUG_ON_PAGE(PageCompound(page), page); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); error = shmem_radix_tree_replace(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; __dec_node_page_state(page, NR_FILE_PAGES); __dec_node_page_state(page, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); put_page(page); BUG_ON(error); } @@ -662,9 +662,9 @@ static int shmem_free_swap(struct address_space *mapping, { void *old; - spin_lock_irq(&mapping->tree_lock); - old = radix_tree_delete_item(&mapping->page_tree, index, radswap); - spin_unlock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); + old = radix_tree_delete_item(&mapping->i_pages, index, radswap); + xa_unlock_irq(&mapping->i_pages); if (old != radswap) return -ENOENT; free_swap_and_cache(radix_to_swp_entry(radswap)); @@ -675,7 +675,7 @@ static int shmem_free_swap(struct address_space *mapping, * Determine (in bytes) how many of the shmem object's pages mapped by the * given offsets are swapped out. * - * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * This is safe to call without i_mutex or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_partial_swap_usage(struct address_space *mapping, @@ -688,7 +688,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (iter.index >= end) break; @@ -717,7 +717,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, * Determine (in bytes) how many of the shmem object's pages mapped by the * given vma is swapped out. * - * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * This is safe to call without i_mutex or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_swap_usage(struct vm_area_struct *vma) @@ -1132,7 +1132,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, int error = 0; radswap = swp_to_radix_entry(swap); - index = find_swap_entry(&mapping->page_tree, radswap); + index = find_swap_entry(&mapping->i_pages, radswap); if (index == -1) return -EAGAIN; /* tell shmem_unuse we found nothing */ @@ -1448,7 +1448,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, hindex = round_down(index, HPAGE_PMD_NR); rcu_read_lock(); - if (radix_tree_gang_lookup_slot(&mapping->page_tree, &results, &idx, + if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, hindex, 1) && idx < hindex + HPAGE_PMD_NR) { rcu_read_unlock(); return NULL; @@ -1561,14 +1561,14 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * Our caller will very soon move newpage out of swapcache, but it's * a nice clean interface for us to replace oldpage by newpage there. 
*/ - spin_lock_irq(&swap_mapping->tree_lock); + xa_lock_irq(&swap_mapping->i_pages); error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, newpage); if (!error) { __inc_node_page_state(newpage, NR_FILE_PAGES); __dec_node_page_state(oldpage, NR_FILE_PAGES); } - spin_unlock_irq(&swap_mapping->tree_lock); + xa_unlock_irq(&swap_mapping->i_pages); if (unlikely(error)) { /* @@ -2634,7 +2634,7 @@ static void shmem_tag_pins(struct address_space *mapping) start = 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { page = radix_tree_deref_slot(slot); if (!page || radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { @@ -2642,10 +2642,10 @@ static void shmem_tag_pins(struct address_space *mapping) continue; } } else if (page_count(page) - page_mapcount(page) > 1) { - spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_set(&mapping->page_tree, iter.index, + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_set(&mapping->i_pages, iter.index, SHMEM_TAG_PINNED); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } if (need_resched()) { @@ -2677,7 +2677,7 @@ static int shmem_wait_for_pins(struct address_space *mapping) error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED)) + if (!radix_tree_tagged(&mapping->i_pages, SHMEM_TAG_PINNED)) break; if (!scan) @@ -2687,7 +2687,7 @@ static int shmem_wait_for_pins(struct address_space *mapping) start = 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, SHMEM_TAG_PINNED) { page = radix_tree_deref_slot(slot); @@ -2713,10 +2713,10 @@ static int shmem_wait_for_pins(struct address_space *mapping) error = -EBUSY; } - spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_clear(&mapping->page_tree, + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_clear(&mapping->i_pages, iter.index, SHMEM_TAG_PINNED); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); continue_resched: if (need_resched()) { slot = radix_tree_iter_resume(slot, &iter); diff --git a/mm/slub.c b/mm/slub.c index 4fb037c98782..44aa7847324a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1363,10 +1363,8 @@ static __always_inline void kfree_hook(void *x) kasan_kfree_large(x, _RET_IP_); } -static __always_inline void *slab_free_hook(struct kmem_cache *s, void *x) +static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) { - void *freeptr; - kmemleak_free_recursive(x, s->flags); /* @@ -1386,17 +1384,12 @@ static __always_inline void *slab_free_hook(struct kmem_cache *s, void *x) if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); - freeptr = get_freepointer(s, x); - /* - * kasan_slab_free() may put x into memory quarantine, delaying its - * reuse. In this case the object's freelist pointer is changed. 
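slab_free_hook() above now reports whether KASAN has placed the object into quarantine, and the slab_free_freelist_hook() rewrite in the next hunk uses that to rebuild the freelist so only immediately-freeable objects ever reach do_slab_free(). Below is a userspace model of that relinking loop; struct object, its quarantined flag and free_hook() are stand-ins for SLUB's inline freelist pointers and kasan_slab_free(), invented for the example.

#include <stdio.h>

struct object {
	int id;
	int quarantined;	/* models "KASAN wants to delay reuse" */
	struct object *next;	/* models the inline SLUB freelist pointer */
};

static int free_hook(struct object *obj)
{
	return obj->quarantined;	/* nonzero: leave it off the freelist */
}

/* Returns nonzero if anything remains to be freed, like the new bool return. */
static int free_freelist_hook(struct object **head, struct object **tail)
{
	struct object *next = *head;
	struct object *old_tail = *tail ? *tail : *head;
	struct object *obj;

	*head = NULL;
	*tail = NULL;

	do {
		obj = next;
		next = obj->next;
		if (!free_hook(obj)) {
			obj->next = *head;	/* push onto the rebuilt list */
			*head = obj;
			if (!*tail)
				*tail = obj;
		}
	} while (obj != old_tail);

	if (*head == *tail)
		*tail = NULL;
	return *head != NULL;
}

int main(void)
{
	struct object c = { .id = 3 };
	struct object b = { .id = 2, .quarantined = 1, .next = &c };
	struct object a = { .id = 1, .next = &b };
	struct object *head = &a, *tail = &c;

	if (free_freelist_hook(&head, &tail))
		for (struct object *o = head; o; o = o->next)
			printf("free object %d now\n", o->id);
	else
		printf("everything quarantined, nothing to free\n");
	return 0;
}

As in the kernel version, the rebuilt list comes out in reverse order, which is harmless since it is consumed purely as a free list.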
- */ - kasan_slab_free(s, x, _RET_IP_); - return freeptr; + /* KASAN might put x into memory quarantine, delaying its reuse */ + return kasan_slab_free(s, x, _RET_IP_); } -static inline void slab_free_freelist_hook(struct kmem_cache *s, - void *head, void *tail) +static inline bool slab_free_freelist_hook(struct kmem_cache *s, + void **head, void **tail) { /* * Compiler cannot detect this function can be removed if slab_free_hook() @@ -1407,13 +1400,33 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s, defined(CONFIG_DEBUG_OBJECTS_FREE) || \ defined(CONFIG_KASAN) - void *object = head; - void *tail_obj = tail ? : head; - void *freeptr; + void *object; + void *next = *head; + void *old_tail = *tail ? *tail : *head; + + /* Head and tail of the reconstructed freelist */ + *head = NULL; + *tail = NULL; do { - freeptr = slab_free_hook(s, object); - } while ((object != tail_obj) && (object = freeptr)); + object = next; + next = get_freepointer(s, object); + /* If object's reuse doesn't have to be delayed */ + if (!slab_free_hook(s, object)) { + /* Move object to the new freelist */ + set_freepointer(s, object, *head); + *head = object; + if (!*tail) + *tail = object; + } + } while (object != old_tail); + + if (*head == *tail) + *tail = NULL; + + return *head != NULL; +#else + return true; #endif } @@ -2968,14 +2981,12 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, void *head, void *tail, int cnt, unsigned long addr) { - slab_free_freelist_hook(s, head, tail); /* - * slab_free_freelist_hook() could have put the items into quarantine. - * If so, no need to free them. + * With KASAN enabled slab_free_freelist_hook modifies the freelist + * to remove objects, whose reuse must be delayed. */ - if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU)) - return; - do_slab_free(s, page, head, tail, cnt, addr); + if (slab_free_freelist_hook(s, &head, &tail)) + do_slab_free(s, page, head, tail, cnt, addr); } #ifdef CONFIG_KASAN diff --git a/mm/swap_state.c b/mm/swap_state.c index f233dccd3b1b..07f9aa2340c3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -124,10 +124,10 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) SetPageSwapCache(page); address_space = swap_address_space(entry); - spin_lock_irq(&address_space->tree_lock); + xa_lock_irq(&address_space->i_pages); for (i = 0; i < nr; i++) { set_page_private(page + i, entry.val + i); - error = radix_tree_insert(&address_space->page_tree, + error = radix_tree_insert(&address_space->i_pages, idx + i, page + i); if (unlikely(error)) break; @@ -145,13 +145,13 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) VM_BUG_ON(error == -EEXIST); set_page_private(page + i, 0UL); while (i--) { - radix_tree_delete(&address_space->page_tree, idx + i); + radix_tree_delete(&address_space->i_pages, idx + i); set_page_private(page + i, 0UL); } ClearPageSwapCache(page); page_ref_sub(page, nr); } - spin_unlock_irq(&address_space->tree_lock); + xa_unlock_irq(&address_space->i_pages); return error; } @@ -188,7 +188,7 @@ void __delete_from_swap_cache(struct page *page) address_space = swap_address_space(entry); idx = swp_offset(entry); for (i = 0; i < nr; i++) { - radix_tree_delete(&address_space->page_tree, idx + i); + radix_tree_delete(&address_space->i_pages, idx + i); set_page_private(page + i, 0); } ClearPageSwapCache(page); @@ -272,9 +272,9 @@ void delete_from_swap_cache(struct page *page) entry.val = page_private(page); address_space = swap_address_space(entry); - 
spin_lock_irq(&address_space->tree_lock); + xa_lock_irq(&address_space->i_pages); __delete_from_swap_cache(page); - spin_unlock_irq(&address_space->tree_lock); + xa_unlock_irq(&address_space->i_pages); put_swap_page(page, entry); page_ref_sub(page, hpage_nr_pages(page)); @@ -628,12 +628,11 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) return -ENOMEM; for (i = 0; i < nr; i++) { space = spaces + i; - INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN); + INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN); atomic_set(&space->i_mmap_writable, 0); space->a_ops = &swap_aops; /* swap cache doesn't use writeback related tags */ mapping_set_no_writeback_tags(space); - spin_lock_init(&space->tree_lock); } nr_swapper_spaces[type] = nr; rcu_assign_pointer(swapper_spaces[type], spaces); diff --git a/mm/swapfile.c b/mm/swapfile.c index c7a33717d079..cc2cf04d9018 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ -struct plist_head *swap_avail_heads; +static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -2961,6 +2961,10 @@ static unsigned long read_swap_header(struct swap_info_struct *p, maxpages = swp_offset(pte_to_swp_entry( swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; last_page = swap_header->info.last_page; + if (!last_page) { + pr_warn("Empty swap-file\n"); + return 0; + } if (last_page > maxpages) { pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", maxpages << (PAGE_SHIFT - 10), diff --git a/mm/truncate.c b/mm/truncate.c index c34e2fd4f583..1d2fb2dca96f 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -36,11 +36,11 @@ static inline void __clear_shadow_entry(struct address_space *mapping, struct radix_tree_node *node; void **slot; - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) + if (!__radix_tree_lookup(&mapping->i_pages, index, &node, &slot)) return; if (*slot != entry) return; - __radix_tree_replace(&mapping->page_tree, node, slot, NULL, + __radix_tree_replace(&mapping->i_pages, node, slot, NULL, workingset_update_node); mapping->nrexceptional--; } @@ -48,9 +48,9 @@ static inline void __clear_shadow_entry(struct address_space *mapping, static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, void *entry) { - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); __clear_shadow_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } /* @@ -79,7 +79,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, dax = dax_mapping(mapping); lock = !dax && indices[j] < end; if (lock) - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); for (i = j; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -102,7 +102,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, } if (lock) - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); pvec->nr = j; } @@ -518,8 +518,8 @@ void truncate_inode_pages_final(struct address_space *mapping) * modification that does not see AS_EXITING is * completed before starting the final truncate. 
*/ - spin_lock_irq(&mapping->tree_lock); - spin_unlock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); + xa_unlock_irq(&mapping->i_pages); truncate_inode_pages(mapping, 0); } @@ -627,13 +627,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); if (PageDirty(page)) goto failed; BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -641,7 +641,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) put_page(page); /* pagecache ref */ return 1; failed: - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); return 0; } diff --git a/mm/util.c b/mm/util.c index 029fc2f3b395..1fc4fa7576f7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -287,7 +287,7 @@ int vma_is_stack_for_current(struct vm_area_struct *vma) } #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; @@ -668,6 +668,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free += global_node_page_state(NR_SLAB_RECLAIMABLE); /* + * Part of the kernel memory, which can be released + * under memory pressure. + */ + free += global_node_page_state( + NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT; + + /* * Leave reserved pages. The pages are not for anonymous pages. 
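The NR_INDIRECTLY_RECLAIMABLE_BYTES term added above is maintained in bytes rather than pages, so __vm_enough_memory() shifts it down by PAGE_SHIFT before adding it to the free estimate. A hypothetical wrapper, not part of the patch, makes the conversion explicit:

#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/vmstat.h>

/* Hypothetical convenience wrapper: convert the byte counter to pages
 * before mixing it with the other "free" estimates. */
static unsigned long indirectly_reclaimable_pages(void)
{
	return global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES)
			>> PAGE_SHIFT;
}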
*/ if (free <= totalreserve_pages) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4390a8d5be41..8b920ce3ae02 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -116,6 +116,16 @@ struct scan_control { /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; + + struct { + unsigned int dirty; + unsigned int unqueued_dirty; + unsigned int congested; + unsigned int writeback; + unsigned int immediate; + unsigned int file_taken; + unsigned int taken; + } nr; }; #ifdef ARCH_HAS_PREFETCH @@ -190,6 +200,29 @@ static bool sane_reclaim(struct scan_control *sc) #endif return false; } + +static void set_memcg_congestion(pg_data_t *pgdat, + struct mem_cgroup *memcg, + bool congested) +{ + struct mem_cgroup_per_node *mn; + + if (!memcg) + return; + + mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + WRITE_ONCE(mn->congested, congested); +} + +static bool memcg_congested(pg_data_t *pgdat, + struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *mn; + + mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + return READ_ONCE(mn->congested); + +} #else static bool global_reclaim(struct scan_control *sc) { @@ -200,6 +233,18 @@ static bool sane_reclaim(struct scan_control *sc) { return true; } + +static inline void set_memcg_congestion(struct pglist_data *pgdat, + struct mem_cgroup *memcg, bool congested) +{ +} + +static inline bool memcg_congested(struct pglist_data *pgdat, + struct mem_cgroup *memcg) +{ + return false; + +} #endif /* @@ -648,7 +693,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); /* * The non racy check for a busy page. * @@ -672,7 +717,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * load is not satisfied before that of page->_refcount. * * Note that if SetPageDirty is always performed via set_page_dirty, - * and thus under tree_lock, then this ordering is not required. + * and thus under the i_pages lock, then this ordering is not required. */ if (unlikely(PageTransHuge(page)) && PageSwapCache(page)) refcount = 1 + HPAGE_PMD_NR; @@ -690,7 +735,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); } else { void (*freepage)(struct page *); @@ -711,13 +756,13 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * only page cache pages found in these are zero pages * covering holes, and because we don't want to mix DAX * exceptional entries and shadow exceptional entries in the - * same page_tree. + * same address_space. 
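The set_memcg_congestion()/memcg_congested() helpers added above mirror the node-wide PGDAT_CONGESTED bit for cgroup reclaim. The flag is a single bool that is written and read without holding any lock, so the accessors use WRITE_ONCE()/READ_ONCE() to keep the accesses whole and to document that they are intentionally lockless. Stripped to its essentials (the type name here is illustrative, not mem_cgroup_per_node):

#include <linux/compiler.h>
#include <linux/types.h>

/* Hypothetical per-node state, standing in for mem_cgroup_per_node. */
struct example_node_state {
	bool congested;
};

/* Reclaim publishes the flag without a lock ... */
static void example_set_congested(struct example_node_state *ns, bool val)
{
	WRITE_ONCE(ns->congested, val);
}

/* ... and the throttling path polls it, equally lockless. */
static bool example_congested(struct example_node_state *ns)
{
	return READ_ONCE(ns->congested);
}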
*/ if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); if (freepage != NULL) freepage(page); @@ -726,7 +771,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, return 1; cannot_free: - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); return 0; } @@ -857,17 +902,6 @@ static void page_check_dirty_writeback(struct page *page, mapping->a_ops->is_dirty_writeback(page, dirty, writeback); } -struct reclaim_stat { - unsigned nr_dirty; - unsigned nr_unqueued_dirty; - unsigned nr_congested; - unsigned nr_writeback; - unsigned nr_immediate; - unsigned nr_activate; - unsigned nr_ref_keep; - unsigned nr_unmap_fail; -}; - /* * shrink_page_list() returns the number of reclaimed pages */ @@ -926,7 +960,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); /* - * The number of dirty pages determines if a zone is marked + * The number of dirty pages determines if a node is marked * reclaim_congested which affects wait_iff_congested. kswapd * will stall and start writing pages if the tail of the LRU * is all dirty unqueued pages. @@ -1755,23 +1789,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, free_unref_page_list(&page_list); /* - * If reclaim is isolating dirty pages under writeback, it implies - * that the long-lived page allocation rate is exceeding the page - * laundering rate. Either the global limits are not being effective - * at throttling processes due to the page distribution throughout - * zones or there is heavy usage of a slow backing device. The - * only option is to throttle from reclaim context which is not ideal - * as there is no guarantee the dirtying process is throttled in the - * same way balance_dirty_pages() manages. - * - * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number - * of pages under pages flagged for immediate reclaim and stall if any - * are encountered in the nr_immediate check below. - */ - if (stat.nr_writeback && stat.nr_writeback == nr_taken) - set_bit(PGDAT_WRITEBACK, &pgdat->flags); - - /* * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not doing their job. This can * happen when memory pressure pushes dirty pages to the end of @@ -1785,48 +1802,17 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (stat.nr_unqueued_dirty == nr_taken) wakeup_flusher_threads(WB_REASON_VMSCAN); - /* - * Legacy memcg will stall in page writeback so avoid forcibly - * stalling here. - */ - if (sane_reclaim(sc)) { - /* - * Tag a zone as congested if all the dirty pages scanned were - * backed by a congested BDI and wait_iff_congested will stall. - */ - if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested) - set_bit(PGDAT_CONGESTED, &pgdat->flags); - - /* Allow kswapd to start writing pages during reclaim. */ - if (stat.nr_unqueued_dirty == nr_taken) - set_bit(PGDAT_DIRTY, &pgdat->flags); - - /* - * If kswapd scans pages marked marked for immediate - * reclaim and under writeback (nr_immediate), it implies - * that pages are cycling through the LRU faster than - * they are written so also forcibly stall. 
- */ - if (stat.nr_immediate && current_may_throttle()) - congestion_wait(BLK_RW_ASYNC, HZ/10); - } - - /* - * Stall direct reclaim for IO completions if underlying BDIs or zone - * is congested. Allow kswapd to continue until it starts encountering - * unqueued dirty pages or cycling through the LRU too quickly. - */ - if (!sc->hibernation_mode && !current_is_kswapd() && - current_may_throttle()) - wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10); + sc->nr.dirty += stat.nr_dirty; + sc->nr.congested += stat.nr_congested; + sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; + sc->nr.writeback += stat.nr_writeback; + sc->nr.immediate += stat.nr_immediate; + sc->nr.taken += nr_taken; + if (file) + sc->nr.file_taken += nr_taken; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, - nr_scanned, nr_reclaimed, - stat.nr_dirty, stat.nr_writeback, - stat.nr_congested, stat.nr_immediate, - stat.nr_activate, stat.nr_ref_keep, - stat.nr_unmap_fail, - sc->priority, file); + nr_scanned, nr_reclaimed, &stat, sc->priority, file); return nr_reclaimed; } @@ -2507,6 +2493,12 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return true; } +static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) +{ + return test_bit(PGDAT_CONGESTED, &pgdat->flags) || + (memcg && memcg_congested(pgdat, memcg)); +} + static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct reclaim_state *reclaim_state = current->reclaim_state; @@ -2522,6 +2514,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long node_lru_pages = 0; struct mem_cgroup *memcg; + memset(&sc->nr, 0, sizeof(sc->nr)); + nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; @@ -2536,7 +2530,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->memcg_low_skipped = 1; continue; } - mem_cgroup_event(memcg, MEMCG_LOW); + memcg_memory_event(memcg, MEMCG_LOW); } reclaimed = sc->nr_reclaimed; @@ -2587,6 +2581,67 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; + if (current_is_kswapd()) { + /* + * If reclaim is isolating dirty pages under writeback, + * it implies that the long-lived page allocation rate + * is exceeding the page laundering rate. Either the + * global limits are not being effective at throttling + * processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing + * device. The only option is to throttle from reclaim + * context which is not ideal as there is no guarantee + * the dirtying process is throttled in the same way + * balance_dirty_pages() manages. + * + * Once a node is flagged PGDAT_WRITEBACK, kswapd will + * count the number of pages under pages flagged for + * immediate reclaim and stall if any are encountered + * in the nr_immediate check below. + */ + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) + set_bit(PGDAT_WRITEBACK, &pgdat->flags); + + /* + * Tag a node as congested if all the dirty pages + * scanned were backed by a congested BDI and + * wait_iff_congested will stall. 
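The net effect of this block of changes is that the stall heuristics move from shrink_inactive_list(), which runs once per LRU list, up to shrink_node(): each list scan only adds its reclaim_stat into sc->nr, and the PGDAT_WRITEBACK/PGDAT_CONGESTED/PGDAT_DIRTY decisions plus the kswapd congestion_wait() are made once from the per-node totals. A schematic sketch of that accumulate-then-decide pattern, with names that are illustrative rather than the kernel structures:

#include <stdbool.h>

/* Illustrative totals, playing the role of the new sc->nr block. */
struct reclaim_totals {
	unsigned int dirty;
	unsigned int congested;
	unsigned int unqueued_dirty;
	unsigned int writeback;
	unsigned int immediate;
	unsigned int taken;
};

/* Each per-list scan only accumulates its statistics. */
static void totals_add(struct reclaim_totals *t,
		       const struct reclaim_totals *stat, unsigned int nr_taken)
{
	t->dirty += stat->dirty;
	t->congested += stat->congested;
	t->unqueued_dirty += stat->unqueued_dirty;
	t->writeback += stat->writeback;
	t->immediate += stat->immediate;
	t->taken += nr_taken;
}

/* Decision made once per node pass: everything isolated this pass was
 * already under writeback, so the node should be flagged for throttling
 * (mirrors the PGDAT_WRITEBACK test above). */
static bool node_write_congested(const struct reclaim_totals *t)
{
	return t->writeback && t->writeback == t->taken;
}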
+ */ + if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_bit(PGDAT_CONGESTED, &pgdat->flags); + + /* Allow kswapd to start writing pages during reclaim.*/ + if (sc->nr.unqueued_dirty == sc->nr.file_taken) + set_bit(PGDAT_DIRTY, &pgdat->flags); + + /* + * If kswapd scans pages marked marked for immediate + * reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU + * faster than they are written so also forcibly stall. + */ + if (sc->nr.immediate) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + + /* + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling in wait_iff_congested(). + */ + if (!global_reclaim(sc) && sane_reclaim(sc) && + sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_memcg_congestion(pgdat, root, true); + + /* + * Stall direct reclaim for IO completions if underlying BDIs + * and node is congested. Allow kswapd to continue until it + * starts encountering unqueued dirty pages or cycling through + * the LRU too quickly. + */ + if (!sc->hibernation_mode && !current_is_kswapd() && + current_may_throttle() && pgdat_memcg_congested(pgdat, root)) + wait_iff_congested(BLK_RW_ASYNC, HZ/10); + } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); @@ -2802,6 +2857,7 @@ retry: continue; last_pgdat = zone->zone_pgdat; snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); + set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false); } delayacct_freepages_end(); @@ -3808,7 +3864,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { /* - * Free memory by calling shrink zone with increasing + * Free memory by calling shrink node with increasing * priorities until we have enough memory freed. */ do { diff --git a/mm/vmstat.c b/mm/vmstat.c index 33581be705f0..536332e988b8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1161,6 +1161,7 @@ const char * const vmstat_text[] = { "nr_vmscan_immediate_reclaim", "nr_dirtied", "nr_written", + "nr_indirectly_reclaimable", /* enum writeback_stat_item counters */ "nr_dirty_threshold", diff --git a/mm/workingset.c b/mm/workingset.c index b7d616a3bbbe..40ee02c83978 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -202,7 +202,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, * @mapping: address space the page was backing * @page: the page being evicted * - * Returns a shadow entry to be stored in @mapping->page_tree in place + * Returns a shadow entry to be stored in @mapping->i_pages in place * of the evicted @page so that a later refault can be detected. */ void *workingset_eviction(struct address_space *mapping, struct page *page) @@ -348,7 +348,7 @@ void workingset_update_node(struct radix_tree_node *node) * * Avoid acquiring the list_lru lock when the nodes are * already where they should be. The list_empty() test is safe - * as node->private_list is protected by &mapping->tree_lock. + * as node->private_list is protected by the i_pages lock. 
*/ if (node->count && node->count == node->exceptional) { if (list_empty(&node->private_list)) @@ -366,7 +366,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, unsigned long nodes; unsigned long cache; - /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + /* list_lru lock nests inside the IRQ-safe i_pages lock */ local_irq_disable(); nodes = list_lru_shrink_count(&shadow_nodes, sc); local_irq_enable(); @@ -419,21 +419,21 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, /* * Page cache insertions and deletions synchroneously maintain - * the shadow node LRU under the mapping->tree_lock and the + * the shadow node LRU under the i_pages lock and the * lru_lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the lru_lock pins any * address_space that has radix tree nodes on the LRU. * - * We can then safely transition to the mapping->tree_lock to + * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock. */ node = container_of(item, struct radix_tree_node, private_list); - mapping = container_of(node->root, struct address_space, page_tree); + mapping = container_of(node->root, struct address_space, i_pages); /* Coming from the list, invert the lock order */ - if (!spin_trylock(&mapping->tree_lock)) { + if (!xa_trylock(&mapping->i_pages)) { spin_unlock(lru_lock); ret = LRU_RETRY; goto out; @@ -468,11 +468,11 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); - __radix_tree_delete_node(&mapping->page_tree, node, + __radix_tree_delete_node(&mapping->i_pages, node, workingset_lookup_update(mapping)); out_invalid: - spin_unlock(&mapping->tree_lock); + xa_unlock(&mapping->i_pages); ret = LRU_REMOVED_RETRY; out: local_irq_enable(); @@ -487,7 +487,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, { unsigned long ret; - /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + /* list_lru lock nests inside the IRQ-safe i_pages lock */ local_irq_disable(); ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL); local_irq_enable(); @@ -503,7 +503,7 @@ static struct shrinker workingset_shadow_shrinker = { /* * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe - * mapping->tree_lock. + * i_pages lock. 
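shadow_lru_isolate() is entered with the list_lru lock held, while the documented order is i_pages lock first, then lru_lock, so it may only try to take the mapping lock and has to back off and let the shrinker retry the item when that fails; after this conversion the attempt goes through xa_trylock(). A schematic version of that inverted-order pattern, with a hypothetical helper and LRU_RETRY reduced to an -EAGAIN return:

#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/spinlock.h>

/*
 * Hypothetical helper: isolate something belonging to @mapping while
 * already holding @lru_lock.  Because the regular order is "i_pages lock,
 * then lru_lock", the mapping lock may only be trylocked here; -EAGAIN
 * asks the caller to retry the item later, playing the role of LRU_RETRY
 * in shadow_lru_isolate().
 */
static int example_isolate(struct address_space *mapping, spinlock_t *lru_lock)
{
	if (!xa_trylock(&mapping->i_pages)) {
		spin_unlock(lru_lock);
		return -EAGAIN;
	}

	/* ... both locks held: safe to detach the node from the mapping ... */

	xa_unlock(&mapping->i_pages);
	spin_unlock(lru_lock);
	return 0;
}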
*/ static struct lock_class_key shadow_nodes_key; diff --git a/mm/z3fold.c b/mm/z3fold.c index f579ad4a8100..c0bca6153b95 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -467,6 +467,8 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, spin_lock_init(&pool->lock); spin_lock_init(&pool->stale_lock); pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); + if (!pool->unbuddied) + goto out_pool; for_each_possible_cpu(cpu) { struct list_head *unbuddied = per_cpu_ptr(pool->unbuddied, cpu); @@ -479,7 +481,7 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, pool->name = name; pool->compact_wq = create_singlethread_workqueue(pool->name); if (!pool->compact_wq) - goto out; + goto out_unbuddied; pool->release_wq = create_singlethread_workqueue(pool->name); if (!pool->release_wq) goto out_wq; @@ -489,8 +491,11 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, out_wq: destroy_workqueue(pool->compact_wq); -out: +out_unbuddied: + free_percpu(pool->unbuddied); +out_pool: kfree(pool); +out: return NULL; } @@ -533,7 +538,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, struct z3fold_header *zhdr = NULL; struct page *page = NULL; enum buddy bud; - bool can_sleep = (gfp & __GFP_RECLAIM) == __GFP_RECLAIM; + bool can_sleep = gfpflags_allow_blocking(gfp); if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 764ffd1bb1c5..e16d6713f236 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -791,7 +791,8 @@ our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant|$String)}; our $declaration_macros = qr{(?x: (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,6}\s*\(| (?:$Storage\s+)?[HLP]?LIST_HEAD\s*\(| - (?:$Storage\s+)?${Type}\s+uninitialized_var\s*\( + (?:$Storage\s+)?${Type}\s+uninitialized_var\s*\(| + (?:SKCIPHER_REQUEST|SHASH_DESC|AHASH_REQUEST)_ON_STACK\s*\( )}; sub deparenthesize { @@ -1075,7 +1076,7 @@ sub parse_email { } elsif ($formatted_email =~ /(\S+\@\S+)(.*)$/) { $address = $1; $comment = $2 if defined $2; - $formatted_email =~ s/$address.*$//; + $formatted_email =~ s/\Q$address\E.*$//; $name = $formatted_email; $name = trim($name); $name =~ s/^\"|\"$//g; @@ -1217,7 +1218,7 @@ sub sanitise_line { for ($off = 1; $off < length($line); $off++) { $c = substr($line, $off, 1); - # Comments we are wacking completly including the begin + # Comments we are whacking completely including the begin # and end, all to $;. if ($sanitise_quote eq '' && substr($line, $off, 2) eq '/*') { $sanitise_quote = '*/'; @@ -1297,6 +1298,7 @@ sub sanitise_line { sub get_quoted_string { my ($line, $rawline) = @_; + return "" if (!defined($line) || !defined($rawline)); return "" if ($line !~ m/($String)/g); return substr($rawline, $-[0], $+[0] - $-[0]); } @@ -1644,6 +1646,28 @@ sub raw_line { return $line; } +sub get_stat_real { + my ($linenr, $lc) = @_; + + my $stat_real = raw_line($linenr, 0); + for (my $count = $linenr + 1; $count <= $lc; $count++) { + $stat_real = $stat_real . "\n" . raw_line($count, 0); + } + + return $stat_real; +} + +sub get_stat_here { + my ($linenr, $cnt, $here) = @_; + + my $herectx = $here . "\n"; + for (my $n = 0; $n < $cnt; $n++) { + $herectx .= raw_line($linenr, $n) . 
"\n"; + } + + return $herectx; +} + sub cat_vet { my ($vet) = @_; my ($res, $coded); @@ -2257,6 +2281,8 @@ sub process { my $camelcase_file_seeded = 0; + my $checklicenseline = 1; + sanitise_line_reset(); my $line; foreach my $rawline (@rawlines) { @@ -2448,6 +2474,7 @@ sub process { } else { $check = $check_orig; } + $checklicenseline = 1; next; } @@ -2911,6 +2938,30 @@ sub process { } } +# check for using SPDX license tag at beginning of files + if ($realline == $checklicenseline) { + if ($rawline =~ /^[ \+]\s*\#\!\s*\//) { + $checklicenseline = 2; + } elsif ($rawline =~ /^\+/) { + my $comment = ""; + if ($realfile =~ /\.(h|s|S)$/) { + $comment = '/*'; + } elsif ($realfile =~ /\.(c|dts|dtsi)$/) { + $comment = '//'; + } elsif (($checklicenseline == 2) || $realfile =~ /\.(sh|pl|py|awk|tc)$/) { + $comment = '#'; + } elsif ($realfile =~ /\.rst$/) { + $comment = '..'; + } + + if ($comment !~ /^$/ && + $rawline !~ /^\+\Q$comment\E SPDX-License-Identifier: /) { + WARN("SPDX_LICENSE_TAG", + "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr); + } + } + } + # check we are in a valid source file if not then ignore this hunk next if ($realfile !~ /\.(h|c|s|S|sh|dtsi|dts)$/); @@ -3011,6 +3062,12 @@ sub process { } } +# check for assignments on the start of a line + if ($sline =~ /^\+\s+($Assignment)[^=]/) { + CHK("ASSIGNMENT_CONTINUATIONS", + "Assignment operator '$1' should be on the previous line\n" . $hereprev); + } + # check for && or || at the start of a line if ($rawline =~ /^\+\s*(&&|\|\|)/) { CHK("LOGICAL_CONTINUATIONS", @@ -4032,7 +4089,7 @@ sub process { my ($where, $prefix) = ($-[1], $1); if ($prefix !~ /$Type\s+$/ && ($where != 0 || $prefix !~ /^.\s+$/) && - $prefix !~ /[{,]\s+$/) { + $prefix !~ /[{,:]\s+$/) { if (ERROR("BRACKET_SPACE", "space prohibited before open square bracket '['\n" . $herecurr) && $fix) { @@ -4928,12 +4985,8 @@ sub process { #print "REST<$rest> dstat<$dstat> ctx<$ctx>\n"; $ctx =~ s/\n*$//; - my $herectx = $here . "\n"; my $stmt_cnt = statement_rawlines($ctx); - - for (my $n = 0; $n < $stmt_cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } + my $herectx = get_stat_here($linenr, $stmt_cnt, $here); if ($dstat ne '' && $dstat !~ /^(?:$Ident|-?$Constant),$/ && # 10, // foo(), @@ -5005,12 +5058,9 @@ sub process { # check for macros with flow control, but without ## concatenation # ## concatenation is commonly a macro that defines a function so ignore those if ($has_flow_statement && !$has_arg_concat) { - my $herectx = $here . "\n"; my $cnt = statement_rawlines($ctx); + my $herectx = get_stat_here($linenr, $cnt, $here); - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } WARN("MACRO_WITH_FLOW_CONTROL", "Macros with flow control statements should be avoided\n" . "$herectx"); } @@ -5050,11 +5100,7 @@ sub process { $ctx =~ s/\n*$//; my $cnt = statement_rawlines($ctx); - my $herectx = $here . "\n"; - - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } + my $herectx = get_stat_here($linenr, $cnt, $here); if (($stmts =~ tr/;/;/) == 1 && $stmts !~ /^\s*(if|while|for|switch)\b/) { @@ -5068,11 +5114,7 @@ sub process { } elsif ($dstat =~ /^\+\s*#\s*define\s+$Ident.*;\s*$/) { $ctx =~ s/\n*$//; my $cnt = statement_rawlines($ctx); - my $herectx = $here . "\n"; - - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . 
"\n"; - } + my $herectx = get_stat_here($linenr, $cnt, $here); WARN("TRAILING_SEMICOLON", "macros should not use a trailing semicolon\n" . "$herectx"); @@ -5195,12 +5237,8 @@ sub process { } } if ($level == 0 && $block =~ /^\s*\{/ && !$allowed) { - my $herectx = $here . "\n"; my $cnt = statement_rawlines($block); - - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } + my $herectx = get_stat_here($linenr, $cnt, $here); WARN("BRACES", "braces {} are not necessary for single statement blocks\n" . $herectx); @@ -5776,36 +5814,50 @@ sub process { } } - # check for vsprintf extension %p<foo> misuses +# check for vsprintf extension %p<foo> misuses if ($^V && $^V ge 5.10.0 && defined $stat && $stat =~ /^\+(?![^\{]*\{\s*).*\b(\w+)\s*\(.*$String\s*,/s && $1 !~ /^_*volatile_*$/) { - my $bad_extension = ""; + my $specifier; + my $extension; + my $bad_specifier = ""; + my $stat_real; + my $lc = $stat =~ tr@\n@@; $lc = $lc + $linenr; for (my $count = $linenr; $count <= $lc; $count++) { my $fmt = get_quoted_string($lines[$count - 1], raw_line($count, 0)); $fmt =~ s/%%//g; - if ($fmt =~ /(\%[\*\d\.]*p(?![\WSsBKRraEhMmIiUDdgVCbGNOx]).)/) { - $bad_extension = $1; - last; - } - } - if ($bad_extension ne "") { - my $stat_real = raw_line($linenr, 0); - my $ext_type = "Invalid"; - my $use = ""; - for (my $count = $linenr + 1; $count <= $lc; $count++) { - $stat_real = $stat_real . "\n" . raw_line($count, 0); + + while ($fmt =~ /(\%[\*\d\.]*p(\w))/g) { + $specifier = $1; + $extension = $2; + if ($extension !~ /[SsBKRraEhMmIiUDdgVCbGNOx]/) { + $bad_specifier = $specifier; + last; + } + if ($extension eq "x" && !defined($stat_real)) { + if (!defined($stat_real)) { + $stat_real = get_stat_real($linenr, $lc); + } + WARN("VSPRINTF_SPECIFIER_PX", + "Using vsprintf specifier '\%px' potentially exposes the kernel memory layout, if you don't really need the address please consider using '\%p'.\n" . "$here\n$stat_real\n"); + } } - if ($bad_extension =~ /p[Ff]/) { - $ext_type = "Deprecated"; - $use = " - use %pS instead"; - $use =~ s/pS/ps/ if ($bad_extension =~ /pf/); + if ($bad_specifier ne "") { + my $stat_real = get_stat_real($linenr, $lc); + my $ext_type = "Invalid"; + my $use = ""; + if ($bad_specifier =~ /p[Ff]/) { + $ext_type = "Deprecated"; + $use = " - use %pS instead"; + $use =~ s/pS/ps/ if ($bad_specifier =~ /pf/); + } + + WARN("VSPRINTF_POINTER_EXTENSION", + "$ext_type vsprintf pointer extension '$bad_specifier'$use\n" . "$here\n$stat_real\n"); } - WARN("VSPRINTF_POINTER_EXTENSION", - "$ext_type vsprintf pointer extension '$bad_extension'$use\n" . "$here\n$stat_real\n"); } } @@ -5918,10 +5970,7 @@ sub process { $stat !~ /(?:$Compare)\s*\bsscanf\s*$balanced_parens/)) { my $lc = $stat =~ tr@\n@@; $lc = $lc + $linenr; - my $stat_real = raw_line($linenr, 0); - for (my $count = $linenr + 1; $count <= $lc; $count++) { - $stat_real = $stat_real . "\n" . raw_line($count, 0); - } + my $stat_real = get_stat_real($linenr, $lc); WARN("NAKED_SSCANF", "unchecked sscanf return value\n" . "$here\n$stat_real\n"); } @@ -5932,10 +5981,7 @@ sub process { $line =~ /\bsscanf\b/) { my $lc = $stat =~ tr@\n@@; $lc = $lc + $linenr; - my $stat_real = raw_line($linenr, 0); - for (my $count = $linenr + 1; $count <= $lc; $count++) { - $stat_real = $stat_real . "\n" . 
raw_line($count, 0); - } + my $stat_real = get_stat_real($linenr, $lc); if ($stat_real =~ /\bsscanf\b\s*\(\s*$FuncArg\s*,\s*("[^"]+")/) { my $format = $6; my $count = $format =~ tr@%@%@; @@ -6065,12 +6111,9 @@ sub process { } if ($r1 !~ /^sizeof\b/ && $r2 =~ /^sizeof\s*\S/ && !($r1 =~ /^$Constant$/ || $r1 =~ /^[A-Z_][A-Z0-9_]*$/)) { - my $ctx = ''; - my $herectx = $here . "\n"; my $cnt = statement_rawlines($stat); - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } + my $herectx = get_stat_here($linenr, $cnt, $here); + if (WARN("ALLOC_WITH_MULTIPLY", "Prefer $newfunc over $oldfunc with multiply\n" . $herectx) && $cnt == 1 && @@ -6153,12 +6196,9 @@ sub process { if ($^V && $^V ge 5.10.0 && defined $stat && $stat =~ /^\+[$;\s]*(?:case[$;\s]+\w+[$;\s]*:[$;\s]*|)*[$;\s]*\bdefault[$;\s]*:[$;\s]*;/g) { - my $ctx = ''; - my $herectx = $here . "\n"; my $cnt = statement_rawlines($stat); - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } + my $herectx = get_stat_here($linenr, $cnt, $here); + WARN("DEFAULT_NO_BREAK", "switch default: should use break\n" . $herectx); } @@ -6211,6 +6251,12 @@ sub process { } } +# check for bool bitfields + if ($sline =~ /^.\s+bool\s*$Ident\s*:\s*\d+\s*;/) { + WARN("BOOL_BITFIELD", + "Avoid using bool as bitfield. Prefer bool bitfields as unsigned int or u<8|16|32>\n" . $herecurr); + } + # check for semaphores initialized locked if ($line =~ /^.\s*sema_init.+,\W?0\W?\)/) { WARN("CONSIDER_COMPLETION", @@ -6369,10 +6415,7 @@ sub process { my $lc = $stat =~ tr@\n@@; $lc = $lc + $linenr; - my $stat_real = raw_line($linenr, 0); - for (my $count = $linenr + 1; $count <= $lc; $count++) { - $stat_real = $stat_real . "\n" . raw_line($count, 0); - } + my $stat_real = get_stat_real($linenr, $lc); my $skip_args = ""; if ($arg_pos > 1) { @@ -6398,7 +6441,7 @@ sub process { } # check for uses of S_<PERMS> that could be octal for readability - if ($line =~ /\b($multi_mode_perms_string_search)\b/) { + while ($line =~ m{\b($multi_mode_perms_string_search)\b}g) { my $oval = $1; my $octal = perms_to_octal($oval); if (WARN("SYMBOLIC_PERMS", diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 1eeb70e439d7..4cafe6a19167 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6006,6 +6006,7 @@ static int selinux_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd) SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case IPC_STAT: case MSG_STAT: + case MSG_STAT_ANY: perms = MSGQ__GETATTR | MSGQ__ASSOCIATE; break; case IPC_SET: @@ -6157,6 +6158,7 @@ static int selinux_shm_shmctl(struct kern_ipc_perm *shp, int cmd) SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case IPC_STAT: case SHM_STAT: + case SHM_STAT_ANY: perms = SHM__GETATTR | SHM__ASSOCIATE; break; case IPC_SET: @@ -6272,6 +6274,7 @@ static int selinux_sem_semctl(struct kern_ipc_perm *sma, int cmd) break; case IPC_STAT: case SEM_STAT: + case SEM_STAT_ANY: perms = SEM__GETATTR | SEM__ASSOCIATE; break; default: diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 73549007bf9e..0b414836bebd 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -3046,6 +3046,7 @@ static int smack_shm_shmctl(struct kern_ipc_perm *isp, int cmd) switch (cmd) { case IPC_STAT: case SHM_STAT: + case SHM_STAT_ANY: may = MAY_READ; break; case IPC_SET: @@ -3139,6 +3140,7 @@ static int smack_sem_semctl(struct kern_ipc_perm *isp, int cmd) case GETALL: case IPC_STAT: case SEM_STAT: + case SEM_STAT_ANY: may = MAY_READ; break; case SETVAL: @@ 
-3228,6 +3230,7 @@ static int smack_msg_queue_msgctl(struct kern_ipc_perm *isp, int cmd) switch (cmd) { case IPC_STAT: case MSG_STAT: + case MSG_STAT_ANY: may = MAY_READ; break; case IPC_SET: diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index 4ed569fcb139..b21b586b9854 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -7,6 +7,7 @@ #define spinlock_t pthread_mutex_t #define DEFINE_SPINLOCK(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER; +#define __SPIN_LOCK_UNLOCKED(x) (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER #define spin_lock_irqsave(x, f) (void)f, pthread_mutex_lock(x) #define spin_unlock_irqrestore(x, f) (void)f, pthread_mutex_unlock(x) diff --git a/tools/testing/radix-tree/linux/gfp.h b/tools/testing/radix-tree/linux/gfp.h index e3201ccf54c3..32159c08a52e 100644 --- a/tools/testing/radix-tree/linux/gfp.h +++ b/tools/testing/radix-tree/linux/gfp.h @@ -19,6 +19,7 @@ #define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM|__GFP_KSWAPD_RECLAIM) +#define GFP_ZONEMASK 0x0fu #define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 2fc410bc4f33..32aafa92074c 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -25,6 +25,7 @@ TARGETS += mqueue TARGETS += net TARGETS += nsfs TARGETS += powerpc +TARGETS += proc TARGETS += pstore TARGETS += ptrace TARGETS += seccomp diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore new file mode 100644 index 000000000000..6c16f77c722c --- /dev/null +++ b/tools/testing/selftests/proc/.gitignore @@ -0,0 +1,8 @@ +/proc-loadavg-001 +/proc-self-map-files-001 +/proc-self-map-files-002 +/proc-self-syscall +/proc-self-wchan +/proc-uptime-001 +/proc-uptime-002 +/read diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile new file mode 100644 index 000000000000..dbb87e56264c --- /dev/null +++ b/tools/testing/selftests/proc/Makefile @@ -0,0 +1,13 @@ +CFLAGS += -Wall -O2 + +TEST_GEN_PROGS := +TEST_GEN_PROGS += proc-loadavg-001 +TEST_GEN_PROGS += proc-self-map-files-001 +TEST_GEN_PROGS += proc-self-map-files-002 +TEST_GEN_PROGS += proc-self-syscall +TEST_GEN_PROGS += proc-self-wchan +TEST_GEN_PROGS += proc-uptime-001 +TEST_GEN_PROGS += proc-uptime-002 +TEST_GEN_PROGS += read + +include ../lib.mk diff --git a/tools/testing/selftests/proc/config b/tools/testing/selftests/proc/config new file mode 100644 index 000000000000..68fbd2b35884 --- /dev/null +++ b/tools/testing/selftests/proc/config @@ -0,0 +1 @@ +CONFIG_PROC_FS=y diff --git a/tools/testing/selftests/proc/proc-loadavg-001.c b/tools/testing/selftests/proc/proc-loadavg-001.c new file mode 100644 index 000000000000..e38ad6d94d4b --- /dev/null +++ b/tools/testing/selftests/proc/proc-loadavg-001.c @@ -0,0 +1,83 @@ +/* + * Copyright _ 2018 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
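The tools/include/linux/spinlock.h hunk above adds __SPIN_LOCK_UNLOCKED() so that kernel code which initializes a spinlock inside a larger static initializer (as the radix tree root now does for its embedded xa_lock) also builds in the user-space test harness, where the spinlock API is mapped onto pthread mutexes. A small illustration of the kind of code this enables, built with -I tools/include; the struct and init macro here are made up for the example, and the static compound-literal initializer relies on a GNU C extension accepted by gcc:

#include <linux/spinlock.h>	/* the tools/include shim, not the kernel header */

/* Illustrative only: a structure with an embedded, statically initialized lock. */
struct example_root {
	spinlock_t xa_lock;
	void *data;
};

#define EXAMPLE_ROOT_INIT(name) {				\
	.xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock),		\
	.data = NULL,						\
}

static struct example_root example_root = EXAMPLE_ROOT_INIT(example_root);

/* Exercise the shimmed lock API the way kernel code would. */
static void example_touch(void)
{
	unsigned long flags;

	spin_lock_irqsave(&example_root.xa_lock, flags);
	example_root.data = NULL;
	spin_unlock_irqrestore(&example_root.xa_lock, flags);
}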
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* Test that /proc/loadavg correctly reports last pid in pid namespace. */ +#define _GNU_SOURCE +#include <errno.h> +#include <sched.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/wait.h> + +int main(void) +{ + pid_t pid; + int wstatus; + + if (unshare(CLONE_NEWPID) == -1) { + if (errno == ENOSYS || errno == EPERM) + return 2; + return 1; + } + + pid = fork(); + if (pid == -1) + return 1; + if (pid == 0) { + char buf[128], *p; + int fd; + ssize_t rv; + + fd = open("/proc/loadavg" , O_RDONLY); + if (fd == -1) + return 1; + rv = read(fd, buf, sizeof(buf)); + if (rv < 3) + return 1; + p = buf + rv; + + /* pid 1 */ + if (!(p[-3] == ' ' && p[-2] == '1' && p[-1] == '\n')) + return 1; + + pid = fork(); + if (pid == -1) + return 1; + if (pid == 0) + return 0; + if (waitpid(pid, NULL, 0) == -1) + return 1; + + lseek(fd, 0, SEEK_SET); + rv = read(fd, buf, sizeof(buf)); + if (rv < 3) + return 1; + p = buf + rv; + + /* pid 2 */ + if (!(p[-3] == ' ' && p[-2] == '2' && p[-1] == '\n')) + return 1; + + return 0; + } + + if (waitpid(pid, &wstatus, 0) == -1) + return 1; + if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) + return 0; + return 1; +} diff --git a/tools/testing/selftests/proc/proc-self-map-files-001.c b/tools/testing/selftests/proc/proc-self-map-files-001.c new file mode 100644 index 000000000000..af1d0a6af810 --- /dev/null +++ b/tools/testing/selftests/proc/proc-self-map-files-001.c @@ -0,0 +1,82 @@ +/* + * Copyright _ 2018 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* Test readlink /proc/self/map_files/... 
*/ +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/mman.h> +#include <stdlib.h> + +static void pass(const char *fmt, unsigned long a, unsigned long b) +{ + char name[64]; + char buf[64]; + + snprintf(name, sizeof(name), fmt, a, b); + if (readlink(name, buf, sizeof(buf)) == -1) + exit(1); +} + +static void fail(const char *fmt, unsigned long a, unsigned long b) +{ + char name[64]; + char buf[64]; + + snprintf(name, sizeof(name), fmt, a, b); + if (readlink(name, buf, sizeof(buf)) == -1 && errno == ENOENT) + return; + exit(1); +} + +int main(void) +{ + const unsigned int PAGE_SIZE = sysconf(_SC_PAGESIZE); + void *p; + int fd; + unsigned long a, b; + + fd = open("/dev/zero", O_RDONLY); + if (fd == -1) + return 1; + + p = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE, fd, 0); + if (p == MAP_FAILED) + return 1; + + a = (unsigned long)p; + b = (unsigned long)p + PAGE_SIZE; + + pass("/proc/self/map_files/%lx-%lx", a, b); + fail("/proc/self/map_files/ %lx-%lx", a, b); + fail("/proc/self/map_files/%lx -%lx", a, b); + fail("/proc/self/map_files/%lx- %lx", a, b); + fail("/proc/self/map_files/%lx-%lx ", a, b); + fail("/proc/self/map_files/0%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-0%lx", a, b); + if (sizeof(long) == 4) { + fail("/proc/self/map_files/100000000%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-100000000%lx", a, b); + } else if (sizeof(long) == 8) { + fail("/proc/self/map_files/10000000000000000%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-10000000000000000%lx", a, b); + } else + return 1; + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-self-map-files-002.c b/tools/testing/selftests/proc/proc-self-map-files-002.c new file mode 100644 index 000000000000..aebf4be56111 --- /dev/null +++ b/tools/testing/selftests/proc/proc-self-map-files-002.c @@ -0,0 +1,85 @@ +/* + * Copyright _ 2018 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* Test readlink /proc/self/map_files/... with address 0. 
*/ +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/mman.h> +#include <stdlib.h> + +static void pass(const char *fmt, unsigned long a, unsigned long b) +{ + char name[64]; + char buf[64]; + + snprintf(name, sizeof(name), fmt, a, b); + if (readlink(name, buf, sizeof(buf)) == -1) + exit(1); +} + +static void fail(const char *fmt, unsigned long a, unsigned long b) +{ + char name[64]; + char buf[64]; + + snprintf(name, sizeof(name), fmt, a, b); + if (readlink(name, buf, sizeof(buf)) == -1 && errno == ENOENT) + return; + exit(1); +} + +int main(void) +{ + const unsigned int PAGE_SIZE = sysconf(_SC_PAGESIZE); + void *p; + int fd; + unsigned long a, b; + + fd = open("/dev/zero", O_RDONLY); + if (fd == -1) + return 1; + + p = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, fd, 0); + if (p == MAP_FAILED) { + if (errno == EPERM) + return 2; + return 1; + } + + a = (unsigned long)p; + b = (unsigned long)p + PAGE_SIZE; + + pass("/proc/self/map_files/%lx-%lx", a, b); + fail("/proc/self/map_files/ %lx-%lx", a, b); + fail("/proc/self/map_files/%lx -%lx", a, b); + fail("/proc/self/map_files/%lx- %lx", a, b); + fail("/proc/self/map_files/%lx-%lx ", a, b); + fail("/proc/self/map_files/0%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-0%lx", a, b); + if (sizeof(long) == 4) { + fail("/proc/self/map_files/100000000%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-100000000%lx", a, b); + } else if (sizeof(long) == 8) { + fail("/proc/self/map_files/10000000000000000%lx-%lx", a, b); + fail("/proc/self/map_files/%lx-10000000000000000%lx", a, b); + } else + return 1; + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-self-syscall.c b/tools/testing/selftests/proc/proc-self-syscall.c new file mode 100644 index 000000000000..05eb6f91f1e9 --- /dev/null +++ b/tools/testing/selftests/proc/proc-self-syscall.c @@ -0,0 +1,45 @@ +#define _GNU_SOURCE +#include <unistd.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <errno.h> +#include <unistd.h> +#include <string.h> +#include <stdio.h> + +static inline ssize_t sys_read(int fd, void *buf, size_t len) +{ + return syscall(SYS_read, fd, buf, len); +} + +int main(void) +{ + char buf1[64]; + char buf2[64]; + int fd; + ssize_t rv; + + fd = open("/proc/self/syscall", O_RDONLY); + if (fd == -1) { + if (errno == ENOENT) + return 2; + return 1; + } + + /* Do direct system call as libc can wrap anything. 
*/ + snprintf(buf1, sizeof(buf1), "%ld 0x%lx 0x%lx 0x%lx", + (long)SYS_read, (long)fd, (long)buf2, (long)sizeof(buf2)); + + memset(buf2, 0, sizeof(buf2)); + rv = sys_read(fd, buf2, sizeof(buf2)); + if (rv < 0) + return 1; + if (rv < strlen(buf1)) + return 1; + if (strncmp(buf1, buf2, strlen(buf1)) != 0) + return 1; + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-self-wchan.c b/tools/testing/selftests/proc/proc-self-wchan.c new file mode 100644 index 000000000000..b8d8728a6869 --- /dev/null +++ b/tools/testing/selftests/proc/proc-self-wchan.c @@ -0,0 +1,25 @@ +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <errno.h> +#include <unistd.h> + +int main(void) +{ + char buf[64]; + int fd; + + fd = open("/proc/self/wchan", O_RDONLY); + if (fd == -1) { + if (errno == ENOENT) + return 2; + return 1; + } + + buf[0] = '\0'; + if (read(fd, buf, sizeof(buf)) != 1) + return 1; + if (buf[0] != '0') + return 1; + return 0; +} diff --git a/tools/testing/selftests/proc/proc-uptime-001.c b/tools/testing/selftests/proc/proc-uptime-001.c new file mode 100644 index 000000000000..303f26092306 --- /dev/null +++ b/tools/testing/selftests/proc/proc-uptime-001.c @@ -0,0 +1,45 @@ +/* + * Copyright _ 2018 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +// Test that values in /proc/uptime increment monotonically. +#undef NDEBUG +#include <assert.h> +#include <stdint.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +#include "proc-uptime.h" + +int main(void) +{ + uint64_t start, u0, u1, i0, i1; + int fd; + + fd = open("/proc/uptime", O_RDONLY); + assert(fd >= 0); + + proc_uptime(fd, &u0, &i0); + start = u0; + do { + proc_uptime(fd, &u1, &i1); + assert(u1 >= u0); + assert(i1 >= i0); + u0 = u1; + i0 = i1; + } while (u1 - start < 100); + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-uptime-002.c b/tools/testing/selftests/proc/proc-uptime-002.c new file mode 100644 index 000000000000..0cb79e1f1674 --- /dev/null +++ b/tools/testing/selftests/proc/proc-uptime-002.c @@ -0,0 +1,79 @@ +/* + * Copyright _ 2018 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +// Test that values in /proc/uptime increment monotonically +// while shifting across CPUs. +#define _GNU_SOURCE +#undef NDEBUG +#include <assert.h> +#include <unistd.h> +#include <sys/syscall.h> +#include <stdlib.h> +#include <string.h> + +#include <stdint.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +#include "proc-uptime.h" + +static inline int sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned long *m) +{ + return syscall(SYS_sched_getaffinity, pid, len, m); +} + +static inline int sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long *m) +{ + return syscall(SYS_sched_setaffinity, pid, len, m); +} + +int main(void) +{ + unsigned int len; + unsigned long *m; + unsigned int cpu; + uint64_t u0, u1, i0, i1; + int fd; + + /* find out "nr_cpu_ids" */ + m = NULL; + len = 0; + do { + len += sizeof(unsigned long); + free(m); + m = malloc(len); + } while (sys_sched_getaffinity(0, len, m) == -EINVAL); + + fd = open("/proc/uptime", O_RDONLY); + assert(fd >= 0); + + proc_uptime(fd, &u0, &i0); + for (cpu = 0; cpu < len * 8; cpu++) { + memset(m, 0, len); + m[cpu / (8 * sizeof(unsigned long))] |= 1UL << (cpu % (8 * sizeof(unsigned long))); + + /* CPU might not exist, ignore error */ + sys_sched_setaffinity(0, len, m); + + proc_uptime(fd, &u1, &i1); + assert(u1 >= u0); + assert(i1 >= i0); + u0 = u1; + i0 = i1; + } + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-uptime.h b/tools/testing/selftests/proc/proc-uptime.h new file mode 100644 index 000000000000..d584419f50a7 --- /dev/null +++ b/tools/testing/selftests/proc/proc-uptime.h @@ -0,0 +1,74 @@ +/* + * Copyright _ 2018 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ +#undef NDEBUG +#include <assert.h> +#include <errno.h> +#include <string.h> +#include <stdlib.h> +#include <unistd.h> + +static unsigned long long xstrtoull(const char *p, char **end) +{ + if (*p == '0') { + *end = (char *)p + 1; + return 0; + } else if ('1' <= *p && *p <= '9') { + unsigned long long val; + + errno = 0; + val = strtoull(p, end, 10); + assert(errno == 0); + return val; + } else + assert(0); +} + +static void proc_uptime(int fd, uint64_t *uptime, uint64_t *idle) +{ + uint64_t val1, val2; + char buf[64], *p; + ssize_t rv; + + /* save "p < end" checks */ + memset(buf, 0, sizeof(buf)); + rv = pread(fd, buf, sizeof(buf), 0); + assert(0 <= rv && rv <= sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + + p = buf; + + val1 = xstrtoull(p, &p); + assert(p[0] == '.'); + assert('0' <= p[1] && p[1] <= '9'); + assert('0' <= p[2] && p[2] <= '9'); + assert(p[3] == ' '); + + val2 = (p[1] - '0') * 10 + p[2] - '0'; + *uptime = val1 * 100 + val2; + + p += 4; + + val1 = xstrtoull(p, &p); + assert(p[0] == '.'); + assert('0' <= p[1] && p[1] <= '9'); + assert('0' <= p[2] && p[2] <= '9'); + assert(p[3] == '\n'); + + val2 = (p[1] - '0') * 10 + p[2] - '0'; + *idle = val1 * 100 + val2; + + assert(p + 4 == buf + rv); +} diff --git a/tools/testing/selftests/proc/read.c b/tools/testing/selftests/proc/read.c new file mode 100644 index 000000000000..12e397f78592 --- /dev/null +++ b/tools/testing/selftests/proc/read.c @@ -0,0 +1,147 @@ +/* + * Copyright _ 2018 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */
+// Test
+// 1) read of every file in /proc
+// 2) readlink of every symlink in /proc
+// 3) recursively (1) + (2) for every directory in /proc
+// 4) write to /proc/*/clear_refs and /proc/*/task/*/clear_refs
+// 5) write to /proc/sysrq-trigger
+#undef NDEBUG
+#include <assert.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+static inline bool streq(const char *s1, const char *s2)
+{
+	return strcmp(s1, s2) == 0;
+}
+
+static struct dirent *xreaddir(DIR *d)
+{
+	struct dirent *de;
+
+	errno = 0;
+	de = readdir(d);
+	if (!de && errno != 0) {
+		exit(1);
+	}
+	return de;
+}
+
+static void f_reg(DIR *d, const char *filename)
+{
+	char buf[4096];
+	int fd;
+	ssize_t rv;
+
+	/* read from /proc/kmsg can block */
+	fd = openat(dirfd(d), filename, O_RDONLY|O_NONBLOCK);
+	if (fd == -1)
+		return;
+	rv = read(fd, buf, sizeof(buf));
+	assert((0 <= rv && rv <= sizeof(buf)) || rv == -1);
+	close(fd);
+}
+
+static void f_reg_write(DIR *d, const char *filename, const char *buf, size_t len)
+{
+	int fd;
+	ssize_t rv;
+
+	fd = openat(dirfd(d), filename, O_WRONLY);
+	if (fd == -1)
+		return;
+	rv = write(fd, buf, len);
+	assert((0 <= rv && rv <= len) || rv == -1);
+	close(fd);
+}
+
+static void f_lnk(DIR *d, const char *filename)
+{
+	char buf[4096];
+	ssize_t rv;
+
+	rv = readlinkat(dirfd(d), filename, buf, sizeof(buf));
+	assert((0 <= rv && rv <= sizeof(buf)) || rv == -1);
+}
+
+static void f(DIR *d, unsigned int level)
+{
+	struct dirent *de;
+
+	de = xreaddir(d);
+	assert(de->d_type == DT_DIR);
+	assert(streq(de->d_name, "."));
+
+	de = xreaddir(d);
+	assert(de->d_type == DT_DIR);
+	assert(streq(de->d_name, ".."));
+
+	while ((de = xreaddir(d))) {
+		assert(!streq(de->d_name, "."));
+		assert(!streq(de->d_name, ".."));
+
+		switch (de->d_type) {
+			DIR *dd;
+			int fd;
+
+		case DT_REG:
+			if (level == 0 && streq(de->d_name, "sysrq-trigger")) {
+				f_reg_write(d, de->d_name, "h", 1);
+			} else if (level == 1 && streq(de->d_name, "clear_refs")) {
+				f_reg_write(d, de->d_name, "1", 1);
+			} else if (level == 3 && streq(de->d_name, "clear_refs")) {
+				f_reg_write(d, de->d_name, "1", 1);
+			} else {
+				f_reg(d, de->d_name);
+			}
+			break;
+		case DT_DIR:
+			fd = openat(dirfd(d), de->d_name, O_DIRECTORY|O_RDONLY);
+			if (fd == -1)
+				continue;
+			dd = fdopendir(fd);
+			if (!dd)
+				continue;
+			f(dd, level + 1);
+			closedir(dd);
+			break;
+		case DT_LNK:
+			f_lnk(d, de->d_name);
+			break;
+		default:
+			assert(0);
+		}
+	}
+}
+
+int main(void)
+{
+	DIR *d;
+
+	d = opendir("/proc");
+	if (!d)
+		return 2;
+	f(d, 0);
+	return 0;
+}