linux/drivers/base/memory.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

static DEFINE_MUTEX(mem_sysfs_mutex);

#define MEMORY_CLASS_NAME       "memory"

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long base_memory_block_id(unsigned long section_nr)
{
        return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
        return base_memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
        return pfn_to_block_id(PFN_DOWN(phys));
}
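
/*
 * Worked example (illustrative numbers only; the real values depend on
 * the architecture and configuration): with 128 MiB sections and a
 * 2 GiB memory block size, sections_per_block is 16, so section 35
 * belongs to block 35 / 16 = 2, and with 4 KiB pages the physical
 * address 0x100000000 (4 GiB) also resolves to block id 2.
 */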

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
        .name = MEMORY_CLASS_NAME,
        .dev_name = MEMORY_CLASS_NAME,
        .online = memory_subsys_online,
        .offline = memory_subsys_offline,
};

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
        blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);
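
/*
 * Example (hedged sketch, not part of this file): a subsystem holding
 * references to hotpluggable pages could subscribe to the chain above.
 * The foo_* names and foo_drop_page_refs() are hypothetical.
 *
 *        static int foo_memory_callback(struct notifier_block *nb,
 *                                       unsigned long action, void *arg)
 *        {
 *                struct memory_notify *mn = arg;
 *
 *                if (action == MEM_GOING_OFFLINE)
 *                        foo_drop_page_refs(mn->start_pfn, mn->nr_pages);
 *                return NOTIFY_OK;
 *        }
 *
 *        static struct notifier_block foo_memory_nb = {
 *                .notifier_call = foo_memory_callback,
 *        };
 *
 *        register_memory_notifier(&foo_memory_nb);
 */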

static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

int register_memory_isolate_notifier(struct notifier_block *nb)
{
        return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
        atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);

static void memory_block_release(struct device *dev)
{
        struct memory_block *mem = to_memory_block(dev);

        kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
        return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);
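
/*
 * Example (hedged sketch): an architecture may override the __weak
 * default above to report a larger block size. memory_dev_init()
 * insists on a power-of-2 multiple of MIN_MEMORY_BLOCK_SIZE, so a
 * hypothetical override could be:
 *
 *        unsigned long memory_block_size_bytes(void)
 *        {
 *                return SZ_2G;
 *        }
 *
 * (SZ_2G from <linux/sizes.h>; any power-of-2 multiple of the section
 * size would do.)
 */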

/*
 * Show the first physical section index (number) of this memory block.
 */
static ssize_t phys_index_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct memory_block *mem = to_memory_block(dev);
        unsigned long phys_index;

        phys_index = mem->start_section_nr / sections_per_block;
        return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * Show whether the memory block is likely to be offlineable (or is
 * already offline). Once offline, the memory block could be removed.
 * Note, however, that the return value does not indicate whether
 * there is actually a way to remove the memory block.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
                              char *buf)
{
        struct memory_block *mem = to_memory_block(dev);
        unsigned long pfn;
        int ret = 1, i;

        if (mem->state != MEM_ONLINE)
                goto out;

        for (i = 0; i < sections_per_block; i++) {
                if (!present_section_nr(mem->start_section_nr + i))
                        continue;
                pfn = section_nr_to_pfn(mem->start_section_nr + i);
                ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
        }

out:
        return sprintf(buf, "%d\n", ret);
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
                          char *buf)
{
        struct memory_block *mem = to_memory_block(dev);
        ssize_t len = 0;

        /*
         * We can probably put these states in a nice little array
         * so that they're not open-coded
         */
        switch (mem->state) {
        case MEM_ONLINE:
                len = sprintf(buf, "online\n");
                break;
        case MEM_OFFLINE:
                len = sprintf(buf, "offline\n");
                break;
        case MEM_GOING_OFFLINE:
                len = sprintf(buf, "going-offline\n");
                break;
        default:
                len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
                                mem->state);
                WARN_ON(1);
                break;
        }

        return len;
}

int memory_notify(unsigned long val, void *v)
{
        return blocking_notifier_call_chain(&memory_chain, val, v);
}

int memory_isolate_notify(unsigned long val, void *v)
{
        return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}

/*
 * The probe routines leave the pages uninitialized, just as the bootmem code
 * does. Make sure we do not access them, but instead use only information from
 * within sections.
 */
static bool pages_correctly_probed(unsigned long start_pfn)
{
        unsigned long section_nr = pfn_to_section_nr(start_pfn);
        unsigned long section_nr_end = section_nr + sections_per_block;
        unsigned long pfn = start_pfn;

        /*
         * The memmap of adjacent sections is not contiguous except with
         * SPARSEMEM_VMEMMAP. We look up the page once per section and
         * assume the memmap is contiguous within each section.
         */
        for (; section_nr < section_nr_end; section_nr++) {
                if (WARN_ON_ONCE(!pfn_valid(pfn)))
                        return false;

                if (!present_section_nr(section_nr)) {
                        pr_warn("section %ld pfn[%lx, %lx) not present\n",
                                section_nr, pfn, pfn + PAGES_PER_SECTION);
                        return false;
                } else if (!valid_section_nr(section_nr)) {
                        pr_warn("section %ld pfn[%lx, %lx) no valid memmap\n",
                                section_nr, pfn, pfn + PAGES_PER_SECTION);
                        return false;
                } else if (online_section_nr(section_nr)) {
                        pr_warn("section %ld pfn[%lx, %lx) is already online\n",
                                section_nr, pfn, pfn + PAGES_PER_SECTION);
                        return false;
                }
                pfn += PAGES_PER_SECTION;
        }

        return true;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(unsigned long start_section_nr, unsigned long action,
                    int online_type)
{
        unsigned long start_pfn;
        unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
        int ret;

        start_pfn = section_nr_to_pfn(start_section_nr);

        switch (action) {
        case MEM_ONLINE:
                if (!pages_correctly_probed(start_pfn))
                        return -EBUSY;

                ret = online_pages(start_pfn, nr_pages, online_type);
                break;
        case MEM_OFFLINE:
                ret = offline_pages(start_pfn, nr_pages);
                break;
        default:
                WARN(1, "%s(%ld, %ld) unknown action: %ld\n",
                     __func__, start_section_nr, action, action);
                ret = -EINVAL;
        }

        return ret;
}

static int memory_block_change_state(struct memory_block *mem,
                unsigned long to_state, unsigned long from_state_req)
{
        int ret = 0;

        if (mem->state != from_state_req)
                return -EINVAL;

        if (to_state == MEM_OFFLINE)
                mem->state = MEM_GOING_OFFLINE;

        ret = memory_block_action(mem->start_section_nr, to_state,
                                mem->online_type);

        mem->state = ret ? from_state_req : to_state;

        return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
        struct memory_block *mem = to_memory_block(dev);
        int ret;

        if (mem->state == MEM_ONLINE)
                return 0;

        /*
         * If we are called from state_store(), online_type will be
         * set >= 0. Otherwise we were called from the device online
         * attribute and need to set the online_type.
         */
        if (mem->online_type < 0)
                mem->online_type = MMOP_ONLINE_KEEP;

        ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);

        /* clear online_type */
        mem->online_type = -1;

        return ret;
}

static int memory_subsys_offline(struct device *dev)
{
        struct memory_block *mem = to_memory_block(dev);

        if (mem->state == MEM_OFFLINE)
                return 0;

        /* Can't offline block with non-present sections */
        if (mem->section_count != sections_per_block)
                return -EINVAL;

        return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
                           const char *buf, size_t count)
{
        struct memory_block *mem = to_memory_block(dev);
        int ret, online_type;

        ret = lock_device_hotplug_sysfs();
        if (ret)
                return ret;

        if (sysfs_streq(buf, "online_kernel"))
                online_type = MMOP_ONLINE_KERNEL;
        else if (sysfs_streq(buf, "online_movable"))
                online_type = MMOP_ONLINE_MOVABLE;
        else if (sysfs_streq(buf, "online"))
                online_type = MMOP_ONLINE_KEEP;
        else if (sysfs_streq(buf, "offline"))
                online_type = MMOP_OFFLINE;
        else {
                ret = -EINVAL;
                goto err;
        }

        switch (online_type) {
        case MMOP_ONLINE_KERNEL:
        case MMOP_ONLINE_MOVABLE:
        case MMOP_ONLINE_KEEP:
                /* mem->online_type is protected by device_hotplug_lock */
                mem->online_type = online_type;
                ret = device_online(&mem->dev);
                break;
        case MMOP_OFFLINE:
                ret = device_offline(&mem->dev);
                break;
        default:
                ret = -EINVAL; /* should never happen */
        }

err:
        unlock_device_hotplug();

        if (ret < 0)
                return ret;
        if (ret)
                return -EINVAL;

        return count;
}
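
/*
 * Example (userspace, illustrative): the strings parsed above are what
 * an administrator writes to the per-block sysfs file, e.g. for the
 * made-up block id 32:
 *
 *        # echo online_movable > /sys/devices/system/memory/memory32/state
 *        # echo offline > /sys/devices/system/memory/memory32/state
 */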

/*
 * phys_device is a bad name for this.  What we really want
 * is a way to differentiate between memory ranges that are
 * part of physical devices that constitute a complete
 * removable unit or FRU (field-replaceable unit), i.e.:
 * do these ranges belong to the same physical device, such
 * that if all of these sections are offlined the physical
 * device can then be removed?
 */
static ssize_t phys_device_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
{
        struct memory_block *mem = to_memory_block(dev);
        return sprintf(buf, "%d\n", mem->phys_device);
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn,
                unsigned long nr_pages, int online_type,
                struct zone *default_zone)
{
        struct zone *zone;

        zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
        if (zone != default_zone) {
                strcat(buf, " ");
                strcat(buf, zone->name);
        }
}

static ssize_t valid_zones_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
{
        struct memory_block *mem = to_memory_block(dev);
        unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
        unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
        unsigned long valid_start_pfn, valid_end_pfn;
        struct zone *default_zone;
        int nid;

        /*
         * Check the existing zone. Make sure that we do that only on the
         * online nodes, otherwise the page_zone is not reliable.
         */
        if (mem->state == MEM_ONLINE) {
                /*
                 * A block that contains more than one zone can not be
                 * offlined. This can happen e.g. for ZONE_DMA and ZONE_DMA32.
                 */
                if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages,
                                          &valid_start_pfn, &valid_end_pfn))
                        return sprintf(buf, "none\n");
                start_pfn = valid_start_pfn;
                strcat(buf, page_zone(pfn_to_page(start_pfn))->name);
                goto out;
        }

        nid = mem->nid;
        default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages);
        strcat(buf, default_zone->name);

        print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL,
                        default_zone);
        print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE,
                        default_zone);
out:
        strcat(buf, "\n");

        return strlen(buf);
}
static DEVICE_ATTR_RO(valid_zones);
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
                                     struct device_attribute *attr, char *buf)
{
        return sprintf(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);

/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
                                       struct device_attribute *attr, char *buf)
{
        if (memhp_auto_online)
                return sprintf(buf, "online\n");
        else
                return sprintf(buf, "offline\n");
}

static ssize_t auto_online_blocks_store(struct device *dev,
                                        struct device_attribute *attr,
                                        const char *buf, size_t count)
{
        if (sysfs_streq(buf, "online"))
                memhp_auto_online = true;
        else if (sysfs_streq(buf, "offline"))
                memhp_auto_online = false;
        else
                return -EINVAL;

        return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);
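
/*
 * Example (userspace, illustrative): switching the policy so that
 * newly added memory blocks are onlined automatically:
 *
 *        # echo online > /sys/devices/system/memory/auto_online_blocks
 */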

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
                           const char *buf, size_t count)
{
        u64 phys_addr;
        int nid, ret;
        unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

        ret = kstrtoull(buf, 0, &phys_addr);
        if (ret)
                return ret;

        if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
                return -EINVAL;

        ret = lock_device_hotplug_sysfs();
        if (ret)
                return ret;

        nid = memory_add_physaddr_to_nid(phys_addr);
        ret = __add_memory(nid, phys_addr,
                           MIN_MEMORY_BLOCK_SIZE * sections_per_block);

        if (ret)
                goto out;

        ret = count;
out:
        unlock_device_hotplug();
        return ret;
}

static DEVICE_ATTR_WO(probe);
#endif
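
/*
 * Example (userspace, illustrative): probing memory at a physical
 * address; the store above rejects addresses that are not aligned to
 * the memory block size:
 *
 *        # echo 0x80000000 > /sys/devices/system/memory/probe
 */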

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
                                       struct device_attribute *attr,
                                       const char *buf, size_t count)
{
        int ret;
        u64 pfn;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (kstrtoull(buf, 0, &pfn) < 0)
                return -EINVAL;
        pfn >>= PAGE_SHIFT;
        if (!pfn_valid(pfn))
                return -ENXIO;
        /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
        if (!pfn_to_online_page(pfn))
                return -EIO;
        ret = soft_offline_page(pfn_to_page(pfn), 0);
        return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
                                       struct device_attribute *attr,
                                       const char *buf, size_t count)
{
        int ret;
        u64 pfn;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (kstrtoull(buf, 0, &pfn) < 0)
                return -EINVAL;
        pfn >>= PAGE_SHIFT;
        ret = memory_failure(pfn, 0);
        return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif
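
/*
 * Note (illustrative): despite the "pfn" variable name, both stores
 * above parse a physical address and shift it down to a PFN
 * themselves, so userspace writes an address, e.g.:
 *
 *        # echo 0x200000 > /sys/devices/system/memory/soft_offline_page
 */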

/*
 * Note that phys_device is optional.  It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
        return 0;
}

/* A reference for the returned memory block device is acquired. */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
        struct device *dev;

        dev = subsys_find_device_by_id(&memory_subsys, block_id, NULL);
        return dev ? to_memory_block(dev) : NULL;
}

/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all device subsystems.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
        unsigned long block_id = base_memory_block_id(__section_nr(section));

        return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
        &dev_attr_phys_index.attr,
        &dev_attr_state.attr,
        &dev_attr_phys_device.attr,
        &dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
        &dev_attr_valid_zones.attr,
#endif
        NULL
};

static struct attribute_group memory_memblk_attr_group = {
        .attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
        &memory_memblk_attr_group,
        NULL,
};

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
        int ret;

        memory->dev.bus = &memory_subsys;
        memory->dev.id = memory->start_section_nr / sections_per_block;
        memory->dev.release = memory_block_release;
        memory->dev.groups = memory_memblk_attr_groups;
        memory->dev.offline = memory->state == MEM_OFFLINE;

        ret = device_register(&memory->dev);
        if (ret)
                put_device(&memory->dev);

        return ret;
}

static int init_memory_block(struct memory_block **memory,
                             unsigned long block_id, unsigned long state)
{
        struct memory_block *mem;
        unsigned long start_pfn;
        int ret = 0;

        mem = find_memory_block_by_id(block_id);
        if (mem) {
                put_device(&mem->dev);
                return -EEXIST;
        }
        mem = kzalloc(sizeof(*mem), GFP_KERNEL);
        if (!mem)
                return -ENOMEM;

        mem->start_section_nr = block_id * sections_per_block;
        mem->state = state;
        start_pfn = section_nr_to_pfn(mem->start_section_nr);
        mem->phys_device = arch_get_memory_phys_device(start_pfn);
        mem->nid = NUMA_NO_NODE;

        ret = register_memory(mem);

        *memory = mem;
        return ret;
}

static int add_memory_block(unsigned long base_section_nr)
{
        int ret, section_count = 0;
        struct memory_block *mem;
        unsigned long nr;

        for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
             nr++)
                if (present_section_nr(nr))
                        section_count++;

        if (section_count == 0)
                return 0;
        ret = init_memory_block(&mem, base_memory_block_id(base_section_nr),
                                MEM_ONLINE);
        if (ret)
                return ret;
        mem->section_count = section_count;
        return 0;
}

static void unregister_memory(struct memory_block *memory)
{
        if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
                return;

        /* drop the ref. we got via find_memory_block() */
        put_device(&memory->dev);
        device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 */
int create_memory_block_devices(unsigned long start, unsigned long size)
{
        const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
        unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
        struct memory_block *mem;
        unsigned long block_id;
        int ret = 0;

        if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
                         !IS_ALIGNED(size, memory_block_size_bytes())))
                return -EINVAL;

        mutex_lock(&mem_sysfs_mutex);
        for (block_id = start_block_id; block_id != end_block_id; block_id++) {
                ret = init_memory_block(&mem, block_id, MEM_OFFLINE);
                if (ret)
                        break;
                mem->section_count = sections_per_block;
        }
        if (ret) {
                end_block_id = block_id;
                for (block_id = start_block_id; block_id != end_block_id;
                     block_id++) {
                        mem = find_memory_block_by_id(block_id);
                        mem->section_count = 0;
                        unregister_memory(mem);
                }
        }
        mutex_unlock(&mem_sysfs_mutex);
        return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
        const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
        const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
        struct memory_block *mem;
        unsigned long block_id;

        if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
                         !IS_ALIGNED(size, memory_block_size_bytes())))
                return;

        mutex_lock(&mem_sysfs_mutex);
        for (block_id = start_block_id; block_id != end_block_id; block_id++) {
                mem = find_memory_block_by_id(block_id);
                if (WARN_ON_ONCE(!mem))
                        continue;
                mem->section_count = 0;
                unregister_memory_block_under_nodes(mem);
                unregister_memory(mem);
        }
        mutex_unlock(&mem_sysfs_mutex);
}

/* Return true if the memory block is offline, false otherwise. */
bool is_memblock_offlined(struct memory_block *mem)
{
        return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
        &dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
        &dev_attr_soft_offline_page.attr,
        &dev_attr_hard_offline_page.attr,
#endif

        &dev_attr_block_size_bytes.attr,
        &dev_attr_auto_online_blocks.attr,
        NULL
};

static struct attribute_group memory_root_attr_group = {
        .attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
        &memory_root_attr_group,
        NULL,
};

/*
 * Initialize the sysfs support for memory devices...
 */
void __init memory_dev_init(void)
{
        int ret;
        int err;
        unsigned long block_sz, nr;

        /* Validate the configured memory block size */
        block_sz = memory_block_size_bytes();
        if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
                panic("Memory block size not suitable: 0x%lx\n", block_sz);
        sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

        ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
        if (ret)
                goto out;

        /*
         * Create entries for memory sections that were found
         * during boot and have been initialized
         */
        mutex_lock(&mem_sysfs_mutex);
        for (nr = 0; nr <= __highest_present_section_nr;
             nr += sections_per_block) {
                err = add_memory_block(nr);
                if (!ret)
                        ret = err;
        }
        mutex_unlock(&mem_sysfs_mutex);

out:
        if (ret)
                panic("%s() failed: %d\n", __func__, ret);
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *                      by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
                       void *arg, walk_memory_blocks_func_t func)
{
        const unsigned long start_block_id = phys_to_block_id(start);
        const unsigned long end_block_id = phys_to_block_id(start + size - 1);
        struct memory_block *mem;
        unsigned long block_id;
        int ret = 0;

        if (!size)
                return 0;

        for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
                mem = find_memory_block_by_id(block_id);
                if (!mem)
                        continue;

                ret = func(mem, arg);
                put_device(&mem->dev);
                if (ret)
                        break;
        }
        return ret;
}
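
/*
 * Example (hedged sketch): counting the online blocks in a range with
 * a caller-supplied callback. The count_online_cb name is made up.
 *
 *        static int count_online_cb(struct memory_block *mem, void *arg)
 *        {
 *                unsigned long *count = arg;
 *
 *                if (mem->state == MEM_ONLINE)
 *                        (*count)++;
 *                return 0;
 *        }
 *
 *        unsigned long count = 0;
 *
 *        walk_memory_blocks(start, size, &count, count_online_cb);
 */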

struct for_each_memory_block_cb_data {
        walk_memory_blocks_func_t func;
        void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
        struct memory_block *mem = to_memory_block(dev);
        struct for_each_memory_block_cb_data *cb_data = data;

        return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
        struct for_each_memory_block_cb_data cb_data = {
                .func = func,
                .arg = arg,
        };

        return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
                                for_each_memory_block_cb);
}