// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2009  Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker deferred_split_shrinker;

static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
unsigned long huge_pcp_allow_orders __read_mostly;

unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
					 unsigned long vm_flags, bool smaps,
					 bool in_pf, bool enforce_sysfs,
					 unsigned long orders)
{
	/* Check the intersection of requested and supported orders. */
	orders &= vma_is_anonymous(vma) ?
			THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
	if (!orders)
		return 0;

	if (!vma->vm_mm)		/* vdso */
		return 0;

	/*
	 * Explicitly disabled through madvise or prctl, or some
	 * architectures may disable THP for some mappings, for
	 * example, s390 kvm.
	 * */
	if ((vm_flags & VM_NOHUGEPAGE) ||
	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
		return 0;
	/*
	 * If the hardware/firmware marked hugepage support disabled.
	 */
	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
		return 0;

	/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
	if (vma_is_dax(vma))
		return in_pf ? orders : 0;

	/*
	 * khugepaged special VMA and hugetlb VMA.
	 * Must be checked after dax since some dax mappings may have
	 * VM_MIXEDMAP set.
	 */
	if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
		return 0;

	/*
	 * Check alignment for file vma and size for both file and anon vma by
	 * filtering out the unsuitable orders.
	 *
	 * Skip the check for page fault. Huge fault does the check in fault
	 * handlers.
	 */
	if (!in_pf) {
		int order = highest_order(orders);
		unsigned long addr;

		while (orders) {
			addr = vma->vm_end - (PAGE_SIZE << order);
			if (thp_vma_suitable_order(vma, addr, order))
				break;
			order = next_order(&orders, order);
		}

		if (!orders)
			return 0;
	}

	/*
	 * Enabled via shmem mount options or sysfs settings.
	 * Must be done before hugepage flags check since shmem has its
	 * own flags.
	 */
	if (!in_pf && shmem_file(vma->vm_file))
		return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
				     !enforce_sysfs, vma->vm_mm, vm_flags)
			? orders : 0;

	if (!vma_is_anonymous(vma)) {
		/*
		 * Enforce sysfs THP requirements as necessary. Anonymous vmas
		 * were already handled in thp_vma_allowable_orders().
		 */
		if (enforce_sysfs &&
		    (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
						    !hugepage_global_always())))
			return 0;

		/*
		 * Trust that ->huge_fault() handlers know what they are doing
		 * in fault path.
		 */
		if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
			return orders;
		/* Only regular file is valid in collapse path */
		if (((!in_pf || smaps)) && file_thp_enabled(vma))
			return orders;
		return 0;
	}

	if (vma_is_temporary_stack(vma))
		return 0;

	/*
	 * THPeligible bit of smaps should show 1 for proper VMAs even
	 * though anon_vma is not initialized yet.
	 *
	 * Allow page fault since anon_vma may be not initialized until
	 * the first page fault.
	 */
	if (!vma->anon_vma)
		return (smaps || in_pf) ? orders : 0;

	return orders;
}

static bool get_huge_zero_page(void)
{
	struct page *zero_page;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return true;

	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_page) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return false;
	}
	preempt_disable();
	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
		preempt_enable();
		__free_pages(zero_page, compound_order(zero_page));
		goto retry;
	}
	WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	return true;
}

static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

struct page *mm_get_huge_zero_page(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_page);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_page);
}

void mm_put_huge_zero_page(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();
}

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct page *zero_page = xchg(&huge_zero_page, NULL);
		BUG_ON(zero_page == NULL);
		WRITE_ONCE(huge_zero_pfn, ~0UL);
		__free_pages(zero_page, compound_order(zero_page));
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker huge_zero_page_shrinker = {
	.count_objects = shrink_huge_zero_page_count,
	.scan_objects = shrink_huge_zero_page_scan,
	.seeks = DEFAULT_SEEKS,
};

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		output = "[always] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always [madvise] never";
	else
		output = "always madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);

ssize_t single_hugepage_flag_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf,
				  enum transparent_hugepage_flag flag)
{
	return sysfs_emit(buf, "%d\n",
			  !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
		     &transparent_hugepage_flags))
		output = "[always] defer defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
			  &transparent_hugepage_flags))
		output = "always [defer] defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer [defer+madvise] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer defer+madvise [madvise] never";
	else
		output = "always defer defer+madvise madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}


#ifndef pmd_move_must_withdraw
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
					 spinlock_t *old_pmd_ptl,
					 struct vm_area_struct *vma)
{
	/*
	 * With split pmd lock we also need to move preallocated
	 * PTE page table if new_pmd is on different PMD page table.
	 *
	 * We also don't deposit and withdraw tables for file pages.
	 */
	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
}
#endif

static pmd_t move_soft_dirty_pmd(pmd_t pmd)
{
#ifdef CONFIG_MEM_SOFT_DIRTY
	if (unlikely(is_pmd_migration_entry(pmd)))
		pmd = pmd_swp_mksoft_dirty(pmd);
	else if (pmd_present(pmd))
		pmd = pmd_mksoft_dirty(pmd);
#endif
	return pmd;
}

bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;
	pmd_t pmd;
	struct mm_struct *mm = vma->vm_mm;
	bool force_flush = false;

	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it; but move_page_tables() might have already
	 * inserted a page table, if racing against shmem/file collapse.
	 */
	if (!pmd_none(*new_pmd)) {
		VM_BUG_ON(pmd_trans_huge(*new_pmd));
		return false;
	}

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
	if (old_ptl) {
		new_ptl = pmd_lockptr(mm, new_pmd);
		if (new_ptl != old_ptl)
			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
		if (pmd_present(pmd))
			force_flush = true;
		VM_BUG_ON(!pmd_none(*new_pmd));

		if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
			pgtable_t pgtable;
			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
		}
		pmd = move_soft_dirty_pmd(pmd);
		set_pmd_at(mm, new_addr, new_pmd, pmd);
		if (force_flush)
			flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
		if (new_ptl != old_ptl)
			spin_unlock(new_ptl);
		spin_unlock(old_ptl);
		return true;
	}
	return false;
}

/*
 * Returns
 *  - 0 if PMD could not be locked
 *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
 *      or if prot_numa but THP migration is not supported
 *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
 */
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
		    unsigned long cp_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *ptl;
	pmd_t oldpmd, entry;
	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
	int ret = 1;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	if (prot_numa && !thp_migration_supported())
		return 1;

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
	if (is_swap_pmd(*pmd)) {
		swp_entry_t entry = pmd_to_swp_entry(*pmd);
		struct folio *folio = pfn_swap_entry_folio(entry);
		pmd_t newpmd;

		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
		if (is_writable_migration_entry(entry)) {
			/*
			 * A protection check is difficult so
			 * just be safe and disable write
			 */
			if (folio_test_anon(folio))
				entry = make_readable_exclusive_migration_entry(swp_offset(entry));
			else
				entry = make_readable_migration_entry(swp_offset(entry));
			newpmd = swp_entry_to_pmd(entry);
			if (pmd_swp_soft_dirty(*pmd))
				newpmd = pmd_swp_mksoft_dirty(newpmd);
		} else {
			newpmd = *pmd;
		}

		if (uffd_wp)
			newpmd = pmd_swp_mkuffd_wp(newpmd);
		else if (uffd_wp_resolve)
			newpmd = pmd_swp_clear_uffd_wp(newpmd);
		if (!pmd_same(*pmd, newpmd))
			set_pmd_at(mm, addr, pmd, newpmd);
		goto unlock;
	}
#endif

	if (prot_numa) {
		struct folio *folio;
		bool toptier;
		/*
		 * Avoid trapping faults against the zero page. The read-only
		 * data is likely to be read-cached on the local CPU and
		 * local/remote hits to the zero page are not interesting.
		 */
		if (is_huge_zero_pmd(*pmd))
			goto unlock;

		if (pmd_protnone(*pmd))
			goto unlock;

		folio = page_folio(pmd_page(*pmd));
		toptier = node_is_toptier(folio_nid(folio));
		/*
		 * Skip scanning top tier node if normal numa
		 * balancing is disabled
		 */
		if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
		    toptier)
			goto unlock;

		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
		    !toptier)
			folio_xchg_access_time(folio,
					       jiffies_to_msecs(jiffies));
	}
	/*
	 * In case prot_numa, we are under mmap_read_lock(mm). It's critical
	 * to not clear pmd intermittently to avoid race with MADV_DONTNEED
	 * which is also under mmap_read_lock(mm):
	 *
	 *	CPU0:				CPU1:
	 *				change_huge_pmd(prot_numa=1)
	 *				 pmdp_huge_get_and_clear_notify()
	 * madvise_dontneed()
	 *  zap_pmd_range()
	 *   pmd_trans_huge(*pmd) == 0 (without ptl)
	 *   // skip the pmd
	 *				 set_pmd_at();
	 *				 // pmd is re-established
	 *
	 * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
	 * which may break userspace.
	 *
	 * pmdp_invalidate_ad() is required to make sure we don't miss
	 * dirty/young flags set by hardware.
	 */
	oldpmd = pmdp_invalidate_ad(vma, addr, pmd);

	entry = pmd_modify(oldpmd, newprot);
	if (uffd_wp)
		entry = pmd_mkuffd_wp(entry);
	else if (uffd_wp_resolve)
		/*
		 * Leave the write bit to be handled by PF interrupt
		 * handler, then things like COW could be properly
		 * handled.
		 */
		entry = pmd_clear_uffd_wp(entry);

	/* See change_pte_range(). */
	if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
	    can_change_pmd_writable(vma, addr, entry))
		entry = pmd_mkwrite(entry, vma);

	ret = HPAGE_PMD_NR;
	set_pmd_at(mm, addr, pmd, entry);

	if (huge_pmd_needs_flush(oldpmd, entry))
		tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
unlock:
	spin_unlock(ptl);
	return ret;
}

/*
 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
 *
 * Note that if it returns page table lock pointer, this routine returns without
 * unlocking page table lock. So callers must unlock it.
 */
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
	spinlock_t *ptl;
	ptl = pmd_lock(vma->vm_mm, pmd);
	if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
			pmd_devmap(*pmd)))
		return ptl;
	spin_unlock(ptl);
	return NULL;
}

/*
 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
 *
 * Note that if it returns page table lock pointer, this routine returns without
 * unlocking page table lock. So callers must unlock it.
 */
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
	spinlock_t *ptl;

	ptl = pud_lock(vma->vm_mm, pud);
	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
		return ptl;
	spin_unlock(ptl);
	return NULL;
}
EXPORT_SYMBOL_GPL(__pud_trans_huge_lock);

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pud_t *pud, unsigned long addr)
{
	spinlock_t *ptl;

	ptl = __pud_trans_huge_lock(pud, vma);
	if (!ptl)
		return 0;

	pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
	tlb_remove_pud_tlb_entry(tlb, pud, addr);
	if (vma_is_special_huge(vma)) {
		spin_unlock(ptl);
		/* No zero page support yet */
	} else {
		/* No support for anonymous PUD pages yet */
		BUG();
	}
	return 1;
}

static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
		unsigned long haddr)
{
	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
	VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));

	count_vm_event(THP_SPLIT_PUD);

	pudp_huge_clear_flush(vma, haddr, pud);
}

void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
		unsigned long address)
{
	spinlock_t *ptl;
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				address & HPAGE_PUD_MASK,
				(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	ptl = pud_lock(vma->vm_mm, pud);
	if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
		goto out;
	__split_huge_pud_locked(vma, pud, range.start);

out:
	spin_unlock(ptl);
	mmu_notifier_invalidate_range_end(&range);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
		unsigned long haddr, pmd_t *pmd)
{
	struct mm_struct *mm = vma->vm_mm;
	pgtable_t pgtable;
	pmd_t _pmd, old_pmd;
	unsigned long addr;
	pte_t *pte;
	int i;

	/*
	 * Leave pmd empty until pte is filled note that it is fine to delay
	 * notification until mmu_notifier_invalidate_range_end() as we are
	 * replacing a zero pmd write protected page with a zero pte write
	 * protected page.
	 *
	 * See Documentation/mm/mmu_notifier.rst
	 */
	old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);

	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pmd_populate(mm, &_pmd, pgtable);

	pte = pte_offset_map(&_pmd, haddr);
	VM_BUG_ON(!pte);
	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
		pte_t entry;

		entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
		entry = pte_mkspecial(entry);
		if (pmd_uffd_wp(old_pmd))
			entry = pte_mkuffd_wp(entry);
		VM_BUG_ON(!pte_none(ptep_get(pte)));
		set_pte_at(mm, addr, pte, entry);
		pte++;
	}
	pte_unmap(pte - 1);
	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
}

static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long haddr, bool freeze)
{
	struct mm_struct *mm = vma->vm_mm;
	struct folio *folio;
	struct page *page;
	pgtable_t pgtable;
	pmd_t old_pmd, _pmd;
	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
	bool anon_exclusive = false, dirty = false;
	unsigned long addr;
	pte_t *pte;
	int i;

	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
				&& !pmd_devmap(*pmd));

	count_vm_event(THP_SPLIT_PMD);

	if (!vma_is_anonymous(vma)) {
		old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
		/*
		 * We are going to unmap this huge page. So
		 * just go ahead and zap it
		 */
		if (arch_needs_pgtable_deposit())
			zap_deposited_table(mm, pmd);
		if (vma_is_special_huge(vma))
			return;
		if (unlikely(is_pmd_migration_entry(old_pmd))) {
			swp_entry_t entry;

			entry = pmd_to_swp_entry(old_pmd);
			folio = pfn_swap_entry_folio(entry);
		} else {
			page = pmd_page(old_pmd);
			folio = page_folio(page);
			if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
				folio_mark_dirty(folio);
			if (!folio_test_referenced(folio) && pmd_young(old_pmd))
				folio_set_referenced(folio);
			add_reliable_page_counter(page, mm, -HPAGE_PMD_NR);
			folio_remove_rmap_pmd(folio, page, vma);
			folio_put(folio);
		}
		add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
		return;
	}

	if (is_huge_zero_pmd(*pmd)) {
		/*
		 * FIXME: Do we want to invalidate secondary mmu by calling
		 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
		 * inside __split_huge_pmd() ?
		 *
		 * We are going from a zero huge page write protected to zero
		 * small page also write protected so it does not seems useful
		 * to invalidate secondary mmu at this time.
		 */
		return __split_huge_zero_page_pmd(vma, haddr, pmd);
	}

	/*
	 * Up to this point the pmd is present and huge and userland has the
	 * whole access to the hugepage during the split (which happens in
	 * place). If we overwrite the pmd with the not-huge version pointing
	 * to the pte here (which of course we could if all CPUs were bug
	 * free), userland could trigger a small page size TLB miss on the
	 * small sized TLB while the hugepage TLB entry is still established in
	 * the huge TLB. Some CPU doesn't like that.
	 * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
	 * 383 on page 105. Intel should be safe but is also warns that it's
	 * only safe if the permission and cache attributes of the two entries
	 * loaded in the two TLB is identical (which should be the case here).
	 * But it is generally safer to never allow small and huge TLB entries
	 * for the same virtual address to be loaded simultaneously. So instead
	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
	 * current pmd notpresent (atomically because here the pmd_trans_huge
	 * must remain set at all times on the pmd until the split is complete
	 * for this pmd), then we flush the SMP TLB and finally we write the
	 * non-huge version of the pmd entry with pmd_populate.
	 */
	old_pmd = pmdp_invalidate(vma, haddr, pmd);

	pmd_migration = is_pmd_migration_entry(old_pmd);
	if (unlikely(pmd_migration)) {
		swp_entry_t entry;

		entry = pmd_to_swp_entry(old_pmd);
		page = pfn_swap_entry_to_page(entry);
		write = is_writable_migration_entry(entry);
		if (PageAnon(page))
			anon_exclusive = is_readable_exclusive_migration_entry(entry);
		young = is_migration_entry_young(entry);
		dirty = is_migration_entry_dirty(entry);
		soft_dirty = pmd_swp_soft_dirty(old_pmd);
		uffd_wp = pmd_swp_uffd_wp(old_pmd);
	} else {
		page = pmd_page(old_pmd);
		folio = page_folio(page);
		if (pmd_dirty(old_pmd)) {
			dirty = true;
			folio_set_dirty(folio);
		}
		write = pmd_write(old_pmd);
		young = pmd_young(old_pmd);
		soft_dirty = pmd_soft_dirty(old_pmd);
		uffd_wp = pmd_uffd_wp(old_pmd);

		VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
		VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

		/*
		 * Without "freeze", we'll simply split the PMD, propagating the
		 * PageAnonExclusive() flag for each PTE by setting it for
		 * each subpage -- no need to (temporarily) clear.
		 *
		 * With "freeze" we want to replace mapped pages by
		 * migration entries right away. This is only possible if we
		 * managed to clear PageAnonExclusive() -- see
		 * set_pmd_migration_entry().
		 *
		 * In case we cannot clear PageAnonExclusive(), split the PMD
		 * only and let try_to_migrate_one() fail later.
		 *
		 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
		 */
		anon_exclusive = PageAnonExclusive(page);
		if (freeze && anon_exclusive &&
		    folio_try_share_anon_rmap_pmd(folio, page))
			freeze = false;
		if (!freeze) {
			rmap_t rmap_flags = RMAP_NONE;

			folio_ref_add(folio, HPAGE_PMD_NR - 1);
			if (anon_exclusive)
				rmap_flags |= RMAP_EXCLUSIVE;
			folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
						 vma, haddr, rmap_flags);
		}
	}

	/*
	 * Withdraw the table only after we mark the pmd entry invalid.
	 * This's critical for some architectures (Power).
	 */
	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pmd_populate(mm, &_pmd, pgtable);

	pte = pte_offset_map(&_pmd, haddr);
	VM_BUG_ON(!pte);

	/*
	 * Note that NUMA hinting access restrictions are not transferred to
	 * avoid any possibility of altering permissions across VMAs.
	 */
	if (freeze || pmd_migration) {
		for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
			pte_t entry;
			swp_entry_t swp_entry;

			if (write)
				swp_entry = make_writable_migration_entry(
							page_to_pfn(page + i));
			else if (anon_exclusive)
				swp_entry = make_readable_exclusive_migration_entry(
							page_to_pfn(page + i));
			else
				swp_entry = make_readable_migration_entry(
							page_to_pfn(page + i));
			if (young)
				swp_entry = make_migration_entry_young(swp_entry);
			if (dirty)
				swp_entry = make_migration_entry_dirty(swp_entry);
			entry = swp_entry_to_pte(swp_entry);
			if (soft_dirty)
				entry = pte_swp_mksoft_dirty(entry);
			if (uffd_wp)
				entry = pte_swp_mkuffd_wp(entry);

			VM_WARN_ON(!pte_none(ptep_get(pte + i)));
			set_pte_at(mm, addr, pte + i, entry);
		}
	} else {
		pte_t entry;

		entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
		if (write)
			entry = pte_mkwrite(entry, vma);
		if (!young)
			entry = pte_mkold(entry);
		/* NOTE: this may set soft-dirty too on some archs */
		if (dirty)
			entry = pte_mkdirty(entry);
		if (soft_dirty)
			entry = pte_mksoft_dirty(entry);
		if (uffd_wp)
			entry = pte_mkuffd_wp(entry);

		for (i = 0; i < HPAGE_PMD_NR; i++)
			VM_WARN_ON(!pte_none(ptep_get(pte + i)));

		set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
	}
	pte_unmap(pte);

	if (!pmd_migration)
		folio_remove_rmap_pmd(folio, page, vma);
	if (freeze)
		put_page(page);

	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
}

void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long address, bool freeze, struct folio *folio)
{
	spinlock_t *ptl;
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				address & HPAGE_PMD_MASK,
				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	ptl = pmd_lock(vma->vm_mm, pmd);

	/*
	 * If caller asks to setup a migration entry, we need a folio to check
	 * pmd against. Otherwise we can end up replacing wrong folio.
	 */
	VM_BUG_ON(freeze && !folio);
	VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));

	if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
	    is_pmd_migration_entry(*pmd)) {
		/*
		 * It's safe to call pmd_page when folio is set because it's
		 * guaranteed that pmd is present.
		 */
		if (folio && folio != page_folio(pmd_page(*pmd)))
			goto out;
		__split_huge_pmd_locked(vma, pmd, range.start, freeze);
	}

out:
	spin_unlock(ptl);
	mmu_notifier_invalidate_range_end(&range);
}

static const struct file_operations split_huge_pages_fops = {
	.owner	 = THIS_MODULE,
	.write	 = split_huge_pages_write,
	.llseek  = no_llseek,
};

static int __init split_huge_pages_debugfs(void)
{
	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
			    &split_huge_pages_fops);
	return 0;
}
late_initcall(split_huge_pages_debugfs);
#endif

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
		struct page *page)
{
	struct folio *folio = page_folio(page);
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	bool anon_exclusive;
	pmd_t pmdval;
	swp_entry_t entry;
	pmd_t pmdswp;

	if (!(pvmw->pmd && !pvmw->pte))
		return 0;

	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);

	/* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
	anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
	if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
		set_pmd_at(mm, address, pvmw->pmd, pmdval);
		return -EBUSY;
	}

	if (pmd_dirty(pmdval))
		folio_mark_dirty(folio);
	if (pmd_write(pmdval))
		entry = make_writable_migration_entry(page_to_pfn(page));
	else if (anon_exclusive)
		entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
	else
		entry = make_readable_migration_entry(page_to_pfn(page));
	if (pmd_young(pmdval))
		entry = make_migration_entry_young(entry);
	if (pmd_dirty(pmdval))
		entry = make_migration_entry_dirty(entry);
	pmdswp = swp_entry_to_pmd(entry);
	if (pmd_soft_dirty(pmdval))
		pmdswp = pmd_swp_mksoft_dirty(pmdswp);
	if (pmd_uffd_wp(pmdval))
		pmdswp = pmd_swp_mkuffd_wp(pmdswp);
	set_pmd_at(mm, address, pvmw->pmd, pmdswp);
	folio_remove_rmap_pmd(folio, page, vma);
	folio_put(folio);
	trace_set_migration_pmd(address, pmd_val(pmdswp));

	return 0;
}

void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
{
	struct folio *folio = page_folio(new);
	struct vm_area_struct *vma = pvmw->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address = pvmw->address;
	unsigned long haddr = address & HPAGE_PMD_MASK;
	pmd_t pmde;
	swp_entry_t entry;

	if (!(pvmw->pmd && !pvmw->pte))
		return;

	entry = pmd_to_swp_entry(*pvmw->pmd);
	folio_get(folio);
	pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
	if (pmd_swp_soft_dirty(*pvmw->pmd))
		pmde = pmd_mksoft_dirty(pmde);
	if (is_writable_migration_entry(entry))
		pmde = pmd_mkwrite(pmde, vma);
	if (pmd_swp_uffd_wp(*pvmw->pmd))
		pmde = pmd_mkuffd_wp(pmde);
	if (!is_migration_entry_young(entry))
		pmde = pmd_mkold(pmde);
	/* NOTE: this may contain setting soft-dirty on some archs */
	if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
		pmde = pmd_mkdirty(pmde);

	if (folio_test_anon(folio)) {
		rmap_t rmap_flags = RMAP_NONE;

		if (!is_readable_migration_entry(entry))
			rmap_flags |= RMAP_EXCLUSIVE;

		folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
	} else {
		folio_add_file_rmap_pmd(folio, new, vma);
	}
	VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
	set_pmd_at(mm, haddr, pvmw->pmd, pmde);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache_pmd(vma, address, pvmw->pmd);
	trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif
