linux/drivers/nvdimm/btt.c
   1/*
   2 * Block Translation Table
   3 * Copyright (c) 2014-2015, Intel Corporation.
   4 *
   5 * This program is free software; you can redistribute it and/or modify it
   6 * under the terms and conditions of the GNU General Public License,
   7 * version 2, as published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope it will be useful, but WITHOUT
  10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  12 * more details.
  13 */
  14#include <linux/highmem.h>
  15#include <linux/debugfs.h>
  16#include <linux/blkdev.h>
  17#include <linux/module.h>
  18#include <linux/device.h>
  19#include <linux/mutex.h>
  20#include <linux/hdreg.h>
  21#include <linux/genhd.h>
  22#include <linux/sizes.h>
  23#include <linux/ndctl.h>
  24#include <linux/fs.h>
  25#include <linux/nd.h>
  26#include <linux/backing-dev.h>
  27#include "btt.h"
  28#include "nd.h"
  29
  30enum log_ent_request {
  31        LOG_NEW_ENT = 0,
  32        LOG_OLD_ENT
  33};
  34
  35static struct device *to_dev(struct arena_info *arena)
  36{
  37        return &arena->nd_btt->dev;
  38}
  39
  40static u64 adjust_initial_offset(struct nd_btt *nd_btt, u64 offset)
  41{
  42        return offset + nd_btt->initial_offset;
  43}
  44
  45static int arena_read_bytes(struct arena_info *arena, resource_size_t offset,
  46                void *buf, size_t n, unsigned long flags)
  47{
  48        struct nd_btt *nd_btt = arena->nd_btt;
  49        struct nd_namespace_common *ndns = nd_btt->ndns;
  50
  51        /* arena offsets may be shifted from the base of the device */
  52        offset = adjust_initial_offset(nd_btt, offset);
  53        return nvdimm_read_bytes(ndns, offset, buf, n, flags);
  54}
  55
  56static int arena_write_bytes(struct arena_info *arena, resource_size_t offset,
  57                void *buf, size_t n, unsigned long flags)
  58{
  59        struct nd_btt *nd_btt = arena->nd_btt;
  60        struct nd_namespace_common *ndns = nd_btt->ndns;
  61
  62        /* arena offsets may be shifted from the base of the device */
  63        offset = adjust_initial_offset(nd_btt, offset);
  64        return nvdimm_write_bytes(ndns, offset, buf, n, flags);
  65}
  66
  67static int btt_info_write(struct arena_info *arena, struct btt_sb *super)
  68{
  69        int ret;
  70
  71        /*
  72         * infooff and info2off should always be at least 512B aligned.
  73         * We rely on that to make sure rw_bytes does error clearing
  74         * correctly, so make sure that is the case.
  75         */
  76        dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->infooff, 512),
  77                "arena->infooff: %#llx is unaligned\n", arena->infooff);
  78        dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->info2off, 512),
  79                "arena->info2off: %#llx is unaligned\n", arena->info2off);
  80
  81        ret = arena_write_bytes(arena, arena->info2off, super,
  82                        sizeof(struct btt_sb), 0);
  83        if (ret)
  84                return ret;
  85
  86        return arena_write_bytes(arena, arena->infooff, super,
  87                        sizeof(struct btt_sb), 0);
  88}
  89
  90static int btt_info_read(struct arena_info *arena, struct btt_sb *super)
  91{
  92        return arena_read_bytes(arena, arena->infooff, super,
  93                        sizeof(struct btt_sb), 0);
  94}
  95
  96/*
  97 * 'raw' version of btt_map write
  98 * Assumptions:
  99 *   mapping is in little-endian
 100 *   mapping contains 'E' and 'Z' flags as desired
 101 */
 102static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping,
 103                unsigned long flags)
 104{
 105        u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);
 106
 107        if (unlikely(lba >= arena->external_nlba))
 108                dev_err_ratelimited(to_dev(arena),
 109                        "%s: lba %#x out of range (max: %#x)\n",
 110                        __func__, lba, arena->external_nlba);
 111        return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE, flags);
 112}
 113
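/*
 * On-media encoding of a 4-byte map entry (see the MAP_* definitions in
 * btt.h): the low bits hold the postmap LBA, and the top two bits are the
 * 'zero' (trim) and 'error' flags. The four combinations are:
 *
 *   Z E
 *   0 0   initial state, entry never written: identity mapping
 *   0 1   valid mapping, the data block has a known media error
 *   1 0   valid mapping, the block has been zeroed/trimmed
 *   1 1   MAP_ENT_NORMAL: a regular, valid mapping
 *
 * btt_map_write() below translates the caller's requested (z_flag, e_flag)
 * pair into this representation (a request with neither flag set produces
 * the NORMAL encoding), and btt_map_read() translates it back.
 */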
 114static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping,
 115                        u32 z_flag, u32 e_flag, unsigned long rwb_flags)
 116{
 117        u32 ze;
 118        __le32 mapping_le;
 119
 120        /*
 121         * This 'mapping' is supposed to be just the LBA mapping, without
 122         * any flags set, so strip the flag bits.
 123         */
 124        mapping = ent_lba(mapping);
 125
 126        ze = (z_flag << 1) + e_flag;
 127        switch (ze) {
 128        case 0:
 129                /*
 130                 * We want to set neither of the Z or E flags, and
 131                 * in the actual layout, this means setting the bit
 132                 * positions of both to '1' to indicate a 'normal'
 133                 * map entry
 134                 */
 135                mapping |= MAP_ENT_NORMAL;
 136                break;
 137        case 1:
 138                mapping |= (1 << MAP_ERR_SHIFT);
 139                break;
 140        case 2:
 141                mapping |= (1 << MAP_TRIM_SHIFT);
 142                break;
 143        default:
 144                /*
 145                 * The case where Z and E are both sent in as '1' could be
 146                 * construed as a valid 'normal' case, but we decide not to,
 147                 * to avoid confusion
 148                 */
 149                dev_err_ratelimited(to_dev(arena),
 150                        "Invalid use of Z and E flags\n");
 151                return -EIO;
 152        }
 153
 154        mapping_le = cpu_to_le32(mapping);
 155        return __btt_map_write(arena, lba, mapping_le, rwb_flags);
 156}
 157
 158static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
 159                        int *trim, int *error, unsigned long rwb_flags)
 160{
 161        int ret;
 162        __le32 in;
 163        u32 raw_mapping, postmap, ze, z_flag, e_flag;
 164        u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);
 165
 166        if (unlikely(lba >= arena->external_nlba))
 167                dev_err_ratelimited(to_dev(arena),
 168                        "%s: lba %#x out of range (max: %#x)\n",
 169                        __func__, lba, arena->external_nlba);
 170
 171        ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE, rwb_flags);
 172        if (ret)
 173                return ret;
 174
 175        raw_mapping = le32_to_cpu(in);
 176
 177        z_flag = ent_z_flag(raw_mapping);
 178        e_flag = ent_e_flag(raw_mapping);
 179        ze = (z_flag << 1) + e_flag;
 180        postmap = ent_lba(raw_mapping);
 181
 182        /* Reuse the {z,e}_flag variables for *trim and *error */
 183        z_flag = 0;
 184        e_flag = 0;
 185
 186        switch (ze) {
 187        case 0:
 188                /* Initial state. Return postmap = premap */
 189                *mapping = lba;
 190                break;
 191        case 1:
 192                *mapping = postmap;
 193                e_flag = 1;
 194                break;
 195        case 2:
 196                *mapping = postmap;
 197                z_flag = 1;
 198                break;
 199        case 3:
 200                *mapping = postmap;
 201                break;
 202        default:
 203                return -EIO;
 204        }
 205
 206        if (trim)
 207                *trim = z_flag;
 208        if (error)
 209                *error = e_flag;
 210
 211        return ret;
 212}
 213
 214static int btt_log_group_read(struct arena_info *arena, u32 lane,
 215                        struct log_group *log)
 216{
 217        return arena_read_bytes(arena,
 218                        arena->logoff + (lane * LOG_GRP_SIZE), log,
 219                        LOG_GRP_SIZE, 0);
 220}
 221
 222static struct dentry *debugfs_root;
 223
 224static void arena_debugfs_init(struct arena_info *a, struct dentry *parent,
 225                                int idx)
 226{
 227        char dirname[32];
 228        struct dentry *d;
 229
  230        /* If, for some reason, parent bttN was not created, exit */
 231        if (!parent)
 232                return;
 233
 234        snprintf(dirname, 32, "arena%d", idx);
 235        d = debugfs_create_dir(dirname, parent);
 236        if (IS_ERR_OR_NULL(d))
 237                return;
 238        a->debugfs_dir = d;
 239
 240        debugfs_create_x64("size", S_IRUGO, d, &a->size);
 241        debugfs_create_x64("external_lba_start", S_IRUGO, d,
 242                                &a->external_lba_start);
 243        debugfs_create_x32("internal_nlba", S_IRUGO, d, &a->internal_nlba);
 244        debugfs_create_u32("internal_lbasize", S_IRUGO, d,
 245                                &a->internal_lbasize);
 246        debugfs_create_x32("external_nlba", S_IRUGO, d, &a->external_nlba);
 247        debugfs_create_u32("external_lbasize", S_IRUGO, d,
 248                                &a->external_lbasize);
 249        debugfs_create_u32("nfree", S_IRUGO, d, &a->nfree);
 250        debugfs_create_u16("version_major", S_IRUGO, d, &a->version_major);
 251        debugfs_create_u16("version_minor", S_IRUGO, d, &a->version_minor);
 252        debugfs_create_x64("nextoff", S_IRUGO, d, &a->nextoff);
 253        debugfs_create_x64("infooff", S_IRUGO, d, &a->infooff);
 254        debugfs_create_x64("dataoff", S_IRUGO, d, &a->dataoff);
 255        debugfs_create_x64("mapoff", S_IRUGO, d, &a->mapoff);
 256        debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff);
 257        debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off);
 258        debugfs_create_x32("flags", S_IRUGO, d, &a->flags);
 259        debugfs_create_u32("log_index_0", S_IRUGO, d, &a->log_index[0]);
 260        debugfs_create_u32("log_index_1", S_IRUGO, d, &a->log_index[1]);
 261}
 262
 263static void btt_debugfs_init(struct btt *btt)
 264{
 265        int i = 0;
 266        struct arena_info *arena;
 267
 268        btt->debugfs_dir = debugfs_create_dir(dev_name(&btt->nd_btt->dev),
 269                                                debugfs_root);
 270        if (IS_ERR_OR_NULL(btt->debugfs_dir))
 271                return;
 272
 273        list_for_each_entry(arena, &btt->arena_list, list) {
 274                arena_debugfs_init(arena, btt->debugfs_dir, i);
 275                i++;
 276        }
 277}
 278
 279static u32 log_seq(struct log_group *log, int log_idx)
 280{
 281        return le32_to_cpu(log->ent[log_idx].seq);
 282}
 283
 284/*
  285 * This function accepts a log group, and uses the two sequence
  286 * numbers to find the 'older' of its two live entries. If the
  287 * group has never been used (the first slot's sequence number
  288 * is 0), it seeds that slot with a sequence number of 1.
  289 * Finally, it returns the index (0 or 1) of the older entry.
 290 *
 291 * TODO The logic feels a bit kludge-y. make it better..
 292 */
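/*
 * Worked examples for the sequence comparison below (live sequence
 * numbers cycle 1 -> 2 -> 3 -> 1; 0 only appears in a never-used slot):
 *
 *   seq[idx0]  seq[idx1]   older entry
 *       1          2          idx0
 *       2          3          idx0
 *       3          1          idx0   (1 follows 3 after the wrap)
 *       2          1          idx1
 *
 * Equal sequence numbers, or a pair whose sum exceeds 5 (impossible for
 * values in the 1..3 cycle), is treated as corruption and rejected.
 */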
 293static int btt_log_get_old(struct arena_info *a, struct log_group *log)
 294{
 295        int idx0 = a->log_index[0];
 296        int idx1 = a->log_index[1];
 297        int old;
 298
 299        /*
 300         * the first ever time this is seen, the entry goes into [0]
 301         * the next time, the following logic works out to put this
 302         * (next) entry into [1]
 303         */
 304        if (log_seq(log, idx0) == 0) {
 305                log->ent[idx0].seq = cpu_to_le32(1);
 306                return 0;
 307        }
 308
 309        if (log_seq(log, idx0) == log_seq(log, idx1))
 310                return -EINVAL;
 311        if (log_seq(log, idx0) + log_seq(log, idx1) > 5)
 312                return -EINVAL;
 313
 314        if (log_seq(log, idx0) < log_seq(log, idx1)) {
 315                if ((log_seq(log, idx1) - log_seq(log, idx0)) == 1)
 316                        old = 0;
 317                else
 318                        old = 1;
 319        } else {
 320                if ((log_seq(log, idx0) - log_seq(log, idx1)) == 1)
 321                        old = 1;
 322                else
 323                        old = 0;
 324        }
 325
 326        return old;
 327}
 328
 329/*
 330 * This function copies the desired (old/new) log entry into ent if
 331 * it is not NULL. It returns the sub-slot number (0 or 1)
 332 * where the desired log entry was found. Negative return values
 333 * indicate errors.
 334 */
 335static int btt_log_read(struct arena_info *arena, u32 lane,
 336                        struct log_entry *ent, int old_flag)
 337{
 338        int ret;
 339        int old_ent, ret_ent;
 340        struct log_group log;
 341
 342        ret = btt_log_group_read(arena, lane, &log);
 343        if (ret)
 344                return -EIO;
 345
 346        old_ent = btt_log_get_old(arena, &log);
 347        if (old_ent < 0 || old_ent > 1) {
 348                dev_err(to_dev(arena),
 349                                "log corruption (%d): lane %d seq [%d, %d]\n",
 350                                old_ent, lane, log.ent[arena->log_index[0]].seq,
 351                                log.ent[arena->log_index[1]].seq);
 352                /* TODO set error state? */
 353                return -EIO;
 354        }
 355
 356        ret_ent = (old_flag ? old_ent : (1 - old_ent));
 357
 358        if (ent != NULL)
 359                memcpy(ent, &log.ent[arena->log_index[ret_ent]], LOG_ENT_SIZE);
 360
 361        return ret_ent;
 362}
 363
 364/*
 365 * This function commits a log entry to media
 366 * It does _not_ prepare the freelist entry for the next write
 367 * btt_flog_write is the wrapper for updating the freelist elements
 368 */
 369static int __btt_log_write(struct arena_info *arena, u32 lane,
 370                        u32 sub, struct log_entry *ent, unsigned long flags)
 371{
 372        int ret;
 373        u32 group_slot = arena->log_index[sub];
 374        unsigned int log_half = LOG_ENT_SIZE / 2;
 375        void *src = ent;
 376        u64 ns_off;
 377
 378        ns_off = arena->logoff + (lane * LOG_GRP_SIZE) +
 379                (group_slot * LOG_ENT_SIZE);
 380        /* split the 16B write into atomic, durable halves */
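        /*
         * Per struct log_entry, the first half holds lba/old_map and the
         * second half holds new_map/seq. Writing the seq-bearing half
         * last means this slot cannot appear to be the group's newest
         * entry until it is completely on media.
         */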
 381        ret = arena_write_bytes(arena, ns_off, src, log_half, flags);
 382        if (ret)
 383                return ret;
 384
 385        ns_off += log_half;
 386        src += log_half;
 387        return arena_write_bytes(arena, ns_off, src, log_half, flags);
 388}
 389
 390static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub,
 391                        struct log_entry *ent)
 392{
 393        int ret;
 394
 395        ret = __btt_log_write(arena, lane, sub, ent, NVDIMM_IO_ATOMIC);
 396        if (ret)
 397                return ret;
 398
 399        /* prepare the next free entry */
 400        arena->freelist[lane].sub = 1 - arena->freelist[lane].sub;
 401        if (++(arena->freelist[lane].seq) == 4)
 402                arena->freelist[lane].seq = 1;
 403        if (ent_e_flag(ent->old_map))
 404                arena->freelist[lane].has_err = 1;
 405        arena->freelist[lane].block = le32_to_cpu(ent_lba(ent->old_map));
 406
 407        return ret;
 408}
 409
 410/*
 411 * This function initializes the BTT map to the initial state, which is
 412 * all-zeroes, and indicates an identity mapping
 413 */
 414static int btt_map_init(struct arena_info *arena)
 415{
 416        int ret = -EINVAL;
 417        void *zerobuf;
 418        size_t offset = 0;
 419        size_t chunk_size = SZ_2M;
 420        size_t mapsize = arena->logoff - arena->mapoff;
 421
 422        zerobuf = kzalloc(chunk_size, GFP_KERNEL);
 423        if (!zerobuf)
 424                return -ENOMEM;
 425
 426        /*
  427         * mapoff should always be at least 512B aligned. We rely on that to
 428         * make sure rw_bytes does error clearing correctly, so make sure that
 429         * is the case.
 430         */
 431        dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->mapoff, 512),
 432                "arena->mapoff: %#llx is unaligned\n", arena->mapoff);
 433
 434        while (mapsize) {
 435                size_t size = min(mapsize, chunk_size);
 436
 437                dev_WARN_ONCE(to_dev(arena), size < 512,
 438                        "chunk size: %#zx is unaligned\n", size);
 439                ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf,
 440                                size, 0);
 441                if (ret)
 442                        goto free;
 443
 444                offset += size;
 445                mapsize -= size;
 446                cond_resched();
 447        }
 448
 449 free:
 450        kfree(zerobuf);
 451        return ret;
 452}
 453
 454/*
 455 * This function initializes the BTT log with 'fake' entries pointing
 456 * to the initial reserved set of blocks as being free
 457 */
 458static int btt_log_init(struct arena_info *arena)
 459{
 460        size_t logsize = arena->info2off - arena->logoff;
 461        size_t chunk_size = SZ_4K, offset = 0;
 462        struct log_entry ent;
 463        void *zerobuf;
 464        int ret;
 465        u32 i;
 466
 467        zerobuf = kzalloc(chunk_size, GFP_KERNEL);
 468        if (!zerobuf)
 469                return -ENOMEM;
 470        /*
  471         * logoff should always be at least 512B aligned. We rely on that to
 472         * make sure rw_bytes does error clearing correctly, so make sure that
 473         * is the case.
 474         */
 475        dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->logoff, 512),
 476                "arena->logoff: %#llx is unaligned\n", arena->logoff);
 477
 478        while (logsize) {
 479                size_t size = min(logsize, chunk_size);
 480
 481                dev_WARN_ONCE(to_dev(arena), size < 512,
 482                        "chunk size: %#zx is unaligned\n", size);
 483                ret = arena_write_bytes(arena, arena->logoff + offset, zerobuf,
 484                                size, 0);
 485                if (ret)
 486                        goto free;
 487
 488                offset += size;
 489                logsize -= size;
 490                cond_resched();
 491        }
 492
 493        for (i = 0; i < arena->nfree; i++) {
 494                ent.lba = cpu_to_le32(i);
 495                ent.old_map = cpu_to_le32(arena->external_nlba + i);
 496                ent.new_map = cpu_to_le32(arena->external_nlba + i);
 497                ent.seq = cpu_to_le32(LOG_SEQ_INIT);
 498                ret = __btt_log_write(arena, i, 0, &ent, 0);
 499                if (ret)
 500                        goto free;
 501        }
 502
 503 free:
 504        kfree(zerobuf);
 505        return ret;
 506}
 507
 508static u64 to_namespace_offset(struct arena_info *arena, u64 lba)
 509{
 510        return arena->dataoff + ((u64)lba * arena->internal_lbasize);
 511}
 512
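/*
 * If the block at the head of this lane's free list was involved in a
 * media error, scrub it by writing zeroes through arena_write_bytes()
 * (which lets the lower layers clear known poison) before the block is
 * handed out for a new write.
 */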
 513static int arena_clear_freelist_error(struct arena_info *arena, u32 lane)
 514{
 515        int ret = 0;
 516
 517        if (arena->freelist[lane].has_err) {
 518                void *zero_page = page_address(ZERO_PAGE(0));
 519                u32 lba = arena->freelist[lane].block;
 520                u64 nsoff = to_namespace_offset(arena, lba);
 521                unsigned long len = arena->sector_size;
 522
 523                mutex_lock(&arena->err_lock);
 524
 525                while (len) {
 526                        unsigned long chunk = min(len, PAGE_SIZE);
 527
 528                        ret = arena_write_bytes(arena, nsoff, zero_page,
 529                                chunk, 0);
 530                        if (ret)
 531                                break;
 532                        len -= chunk;
 533                        nsoff += chunk;
 534                        if (len == 0)
 535                                arena->freelist[lane].has_err = 0;
 536                }
 537                mutex_unlock(&arena->err_lock);
 538        }
 539        return ret;
 540}
 541
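/*
 * Rebuild the per-lane free list from the 'new' log entry of each lane,
 * and detect a write that was interrupted after its flog update but
 * before the corresponding map update; in that case the map is fixed up
 * here to match the log.
 */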
 542static int btt_freelist_init(struct arena_info *arena)
 543{
 544        int new, ret;
 545        struct log_entry log_new;
 546        u32 i, map_entry, log_oldmap, log_newmap;
 547
 548        arena->freelist = kcalloc(arena->nfree, sizeof(struct free_entry),
 549                                        GFP_KERNEL);
 550        if (!arena->freelist)
 551                return -ENOMEM;
 552
 553        for (i = 0; i < arena->nfree; i++) {
 554                new = btt_log_read(arena, i, &log_new, LOG_NEW_ENT);
 555                if (new < 0)
 556                        return new;
 557
 558                /* old and new map entries with any flags stripped out */
 559                log_oldmap = ent_lba(le32_to_cpu(log_new.old_map));
 560                log_newmap = ent_lba(le32_to_cpu(log_new.new_map));
 561
 562                /* sub points to the next one to be overwritten */
 563                arena->freelist[i].sub = 1 - new;
 564                arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq));
 565                arena->freelist[i].block = log_oldmap;
 566
 567                /*
 568                 * FIXME: if error clearing fails during init, we want to make
 569                 * the BTT read-only
 570                 */
 571                if (ent_e_flag(log_new.old_map) &&
 572                                !ent_normal(log_new.old_map)) {
 573                        arena->freelist[i].has_err = 1;
 574                        ret = arena_clear_freelist_error(arena, i);
 575                        if (ret)
 576                                dev_err_ratelimited(to_dev(arena),
 577                                        "Unable to clear known errors\n");
 578                }
 579
 580                /* This implies a newly created or untouched flog entry */
 581                if (log_oldmap == log_newmap)
 582                        continue;
 583
 584                /* Check if map recovery is needed */
 585                ret = btt_map_read(arena, le32_to_cpu(log_new.lba), &map_entry,
 586                                NULL, NULL, 0);
 587                if (ret)
 588                        return ret;
 589
 590                /*
 591                 * The map_entry from btt_read_map is stripped of any flag bits,
 592                 * so use the stripped out versions from the log as well for
 593                 * testing whether recovery is needed. For restoration, use the
 594                 * 'raw' version of the log entries as that captured what we
 595                 * were going to write originally.
 596                 */
 597                if ((log_newmap != map_entry) && (log_oldmap == map_entry)) {
 598                        /*
 599                         * Last transaction wrote the flog, but wasn't able
 600                         * to complete the map write. So fix up the map.
 601                         */
 602                        ret = btt_map_write(arena, le32_to_cpu(log_new.lba),
 603                                        le32_to_cpu(log_new.new_map), 0, 0, 0);
 604                        if (ret)
 605                                return ret;
 606                }
 607        }
 608
 609        return 0;
 610}
 611
 612static bool ent_is_padding(struct log_entry *ent)
 613{
 614        return (ent->lba == 0) && (ent->old_map == 0) && (ent->new_map == 0)
 615                && (ent->seq == 0);
 616}
 617
 618/*
 619 * Detecting valid log indices: We read a log group (see the comments in btt.h
 620 * for a description of a 'log_group' and its 'slots'), and iterate over its
 621 * four slots. We expect that a padding slot will be all-zeroes, and use this
 622 * to detect a padding slot vs. an actual entry.
 623 *
 624 * If a log_group is in the initial state, i.e. hasn't been used since the
 625 * creation of this BTT layout, it will have three of the four slots with
 626 * zeroes. We skip over these log_groups for the detection of log_index. If
 627 * all log_groups are in the initial state (i.e. the BTT has never been
 628 * written to), it is safe to assume the 'new format' of log entries in slots
 629 * (0, 1).
 630 */
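/*
 * The only index pairs accepted below are (0, 1) and (0, 2): (0, 1) is the
 * current layout, where slots 2 and 3 are padding, while (0, 2) matches the
 * layout written by older kernels (see the log_group description in btt.h
 * for the history of the padding scheme).
 */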
 631static int log_set_indices(struct arena_info *arena)
 632{
 633        bool idx_set = false, initial_state = true;
 634        int ret, log_index[2] = {-1, -1};
 635        u32 i, j, next_idx = 0;
 636        struct log_group log;
 637        u32 pad_count = 0;
 638
 639        for (i = 0; i < arena->nfree; i++) {
 640                ret = btt_log_group_read(arena, i, &log);
 641                if (ret < 0)
 642                        return ret;
 643
 644                for (j = 0; j < 4; j++) {
 645                        if (!idx_set) {
 646                                if (ent_is_padding(&log.ent[j])) {
 647                                        pad_count++;
 648                                        continue;
 649                                } else {
 650                                        /* Skip if index has been recorded */
 651                                        if ((next_idx == 1) &&
 652                                                (j == log_index[0]))
 653                                                continue;
 654                                        /* valid entry, record index */
 655                                        log_index[next_idx] = j;
 656                                        next_idx++;
 657                                }
 658                                if (next_idx == 2) {
 659                                        /* two valid entries found */
 660                                        idx_set = true;
 661                                } else if (next_idx > 2) {
 662                                        /* too many valid indices */
 663                                        return -ENXIO;
 664                                }
 665                        } else {
 666                                /*
 667                                 * once the indices have been set, just verify
 668                                 * that all subsequent log groups are either in
 669                                 * their initial state or follow the same
 670                                 * indices.
 671                                 */
 672                                if (j == log_index[0]) {
 673                                        /* entry must be 'valid' */
 674                                        if (ent_is_padding(&log.ent[j]))
 675                                                return -ENXIO;
 676                                } else if (j == log_index[1]) {
 677                                        ;
 678                                        /*
 679                                         * log_index[1] can be padding if the
 680                                         * lane never got used and it is still
 681                                         * in the initial state (three 'padding'
 682                                         * entries)
 683                                         */
 684                                } else {
 685                                        /* entry must be invalid (padding) */
 686                                        if (!ent_is_padding(&log.ent[j]))
 687                                                return -ENXIO;
 688                                }
 689                        }
 690                }
 691                /*
 692                 * If any of the log_groups have more than one valid,
  693                 * non-padding entry, then we are no longer in the
 694                 * initial_state
 695                 */
 696                if (pad_count < 3)
 697                        initial_state = false;
 698                pad_count = 0;
 699        }
 700
 701        if (!initial_state && !idx_set)
 702                return -ENXIO;
 703
 704        /*
 705         * If all the entries in the log were in the initial state,
 706         * assume new padding scheme
 707         */
 708        if (initial_state)
 709                log_index[1] = 1;
 710
 711        /*
 712         * Only allow the known permutations of log/padding indices,
 713         * i.e. (0, 1), and (0, 2)
 714         */
 715        if ((log_index[0] == 0) && ((log_index[1] == 1) || (log_index[1] == 2)))
 716                ; /* known index possibilities */
 717        else {
 718                dev_err(to_dev(arena), "Found an unknown padding scheme\n");
 719                return -ENXIO;
 720        }
 721
 722        arena->log_index[0] = log_index[0];
 723        arena->log_index[1] = log_index[1];
 724        dev_dbg(to_dev(arena), "log_index_0 = %d\n", log_index[0]);
 725        dev_dbg(to_dev(arena), "log_index_1 = %d\n", log_index[1]);
 726        return 0;
 727}
 728
 729static int btt_rtt_init(struct arena_info *arena)
 730{
 731        arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL);
 732        if (arena->rtt == NULL)
 733                return -ENOMEM;
 734
 735        return 0;
 736}
 737
 738static int btt_maplocks_init(struct arena_info *arena)
 739{
 740        u32 i;
 741
 742        arena->map_locks = kcalloc(arena->nfree, sizeof(struct aligned_lock),
 743                                GFP_KERNEL);
 744        if (!arena->map_locks)
 745                return -ENOMEM;
 746
 747        for (i = 0; i < arena->nfree; i++)
 748                spin_lock_init(&arena->map_locks[i].lock);
 749
 750        return 0;
 751}
 752
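/*
 * Carve up one arena's worth of raw space. The resulting on-media layout,
 * in increasing offset order, is:
 *
 *   info block (BTT_PG_SIZE) | data area | map | log | info block copy
 *
 * internal_nlba is chosen so that the data area plus one 4-byte map entry
 * per internal block fills the space left over after the fixed-size pieces
 * (to within page rounding), and external_nlba holds back 'nfree' internal
 * blocks for writes in flight.
 */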
 753static struct arena_info *alloc_arena(struct btt *btt, size_t size,
 754                                size_t start, size_t arena_off)
 755{
 756        struct arena_info *arena;
 757        u64 logsize, mapsize, datasize;
 758        u64 available = size;
 759
 760        arena = kzalloc(sizeof(struct arena_info), GFP_KERNEL);
 761        if (!arena)
 762                return NULL;
 763        arena->nd_btt = btt->nd_btt;
 764        arena->sector_size = btt->sector_size;
 765        mutex_init(&arena->err_lock);
 766
 767        if (!size)
 768                return arena;
 769
 770        arena->size = size;
 771        arena->external_lba_start = start;
 772        arena->external_lbasize = btt->lbasize;
 773        arena->internal_lbasize = roundup(arena->external_lbasize,
 774                                        INT_LBASIZE_ALIGNMENT);
 775        arena->nfree = BTT_DEFAULT_NFREE;
 776        arena->version_major = btt->nd_btt->version_major;
 777        arena->version_minor = btt->nd_btt->version_minor;
 778
 779        if (available % BTT_PG_SIZE)
 780                available -= (available % BTT_PG_SIZE);
 781
 782        /* Two pages are reserved for the super block and its copy */
 783        available -= 2 * BTT_PG_SIZE;
 784
 785        /* The log takes a fixed amount of space based on nfree */
 786        logsize = roundup(arena->nfree * LOG_GRP_SIZE, BTT_PG_SIZE);
 787        available -= logsize;
 788
 789        /* Calculate optimal split between map and data area */
 790        arena->internal_nlba = div_u64(available - BTT_PG_SIZE,
 791                        arena->internal_lbasize + MAP_ENT_SIZE);
 792        arena->external_nlba = arena->internal_nlba - arena->nfree;
 793
 794        mapsize = roundup((arena->external_nlba * MAP_ENT_SIZE), BTT_PG_SIZE);
 795        datasize = available - mapsize;
 796
 797        /* 'Absolute' values, relative to start of storage space */
 798        arena->infooff = arena_off;
 799        arena->dataoff = arena->infooff + BTT_PG_SIZE;
 800        arena->mapoff = arena->dataoff + datasize;
 801        arena->logoff = arena->mapoff + mapsize;
 802        arena->info2off = arena->logoff + logsize;
 803
 804        /* Default log indices are (0,1) */
 805        arena->log_index[0] = 0;
 806        arena->log_index[1] = 1;
 807        return arena;
 808}
 809
 810static void free_arenas(struct btt *btt)
 811{
 812        struct arena_info *arena, *next;
 813
 814        list_for_each_entry_safe(arena, next, &btt->arena_list, list) {
 815                list_del(&arena->list);
 816                kfree(arena->rtt);
 817                kfree(arena->map_locks);
 818                kfree(arena->freelist);
 819                debugfs_remove_recursive(arena->debugfs_dir);
 820                kfree(arena);
 821        }
 822}
 823
 824/*
 825 * This function reads an existing valid btt superblock and
 826 * populates the corresponding arena_info struct
 827 */
 828static void parse_arena_meta(struct arena_info *arena, struct btt_sb *super,
 829                                u64 arena_off)
 830{
 831        arena->internal_nlba = le32_to_cpu(super->internal_nlba);
 832        arena->internal_lbasize = le32_to_cpu(super->internal_lbasize);
 833        arena->external_nlba = le32_to_cpu(super->external_nlba);
 834        arena->external_lbasize = le32_to_cpu(super->external_lbasize);
 835        arena->nfree = le32_to_cpu(super->nfree);
 836        arena->version_major = le16_to_cpu(super->version_major);
 837        arena->version_minor = le16_to_cpu(super->version_minor);
 838
 839        arena->nextoff = (super->nextoff == 0) ? 0 : (arena_off +
 840                        le64_to_cpu(super->nextoff));
 841        arena->infooff = arena_off;
 842        arena->dataoff = arena_off + le64_to_cpu(super->dataoff);
 843        arena->mapoff = arena_off + le64_to_cpu(super->mapoff);
 844        arena->logoff = arena_off + le64_to_cpu(super->logoff);
 845        arena->info2off = arena_off + le64_to_cpu(super->info2off);
 846
 847        arena->size = (le64_to_cpu(super->nextoff) > 0)
 848                ? (le64_to_cpu(super->nextoff))
 849                : (arena->info2off - arena->infooff + BTT_PG_SIZE);
 850
 851        arena->flags = le32_to_cpu(super->flags);
 852}
 853
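/*
 * Walk the raw namespace looking for existing, valid arena info blocks,
 * chaining from one arena to the next via 'nextoff' until an arena with
 * nextoff == 0 terminates the list. Each discovered arena also gets its
 * runtime state (log indices, free list, RTT, map locks) set up here.
 */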
 854static int discover_arenas(struct btt *btt)
 855{
 856        int ret = 0;
 857        struct arena_info *arena;
 858        struct btt_sb *super;
 859        size_t remaining = btt->rawsize;
 860        u64 cur_nlba = 0;
 861        size_t cur_off = 0;
 862        int num_arenas = 0;
 863
 864        super = kzalloc(sizeof(*super), GFP_KERNEL);
 865        if (!super)
 866                return -ENOMEM;
 867
 868        while (remaining) {
 869                /* Alloc memory for arena */
 870                arena = alloc_arena(btt, 0, 0, 0);
 871                if (!arena) {
 872                        ret = -ENOMEM;
 873                        goto out_super;
 874                }
 875
 876                arena->infooff = cur_off;
 877                ret = btt_info_read(arena, super);
 878                if (ret)
 879                        goto out;
 880
 881                if (!nd_btt_arena_is_valid(btt->nd_btt, super)) {
 882                        if (remaining == btt->rawsize) {
 883                                btt->init_state = INIT_NOTFOUND;
 884                                dev_info(to_dev(arena), "No existing arenas\n");
 885                                goto out;
 886                        } else {
 887                                dev_err(to_dev(arena),
 888                                                "Found corrupted metadata!\n");
 889                                ret = -ENODEV;
 890                                goto out;
 891                        }
 892                }
 893
 894                arena->external_lba_start = cur_nlba;
 895                parse_arena_meta(arena, super, cur_off);
 896
 897                ret = log_set_indices(arena);
 898                if (ret) {
 899                        dev_err(to_dev(arena),
 900                                "Unable to deduce log/padding indices\n");
 901                        goto out;
 902                }
 903
 904                ret = btt_freelist_init(arena);
 905                if (ret)
 906                        goto out;
 907
 908                ret = btt_rtt_init(arena);
 909                if (ret)
 910                        goto out;
 911
 912                ret = btt_maplocks_init(arena);
 913                if (ret)
 914                        goto out;
 915
 916                list_add_tail(&arena->list, &btt->arena_list);
 917
 918                remaining -= arena->size;
 919                cur_off += arena->size;
 920                cur_nlba += arena->external_nlba;
 921                num_arenas++;
 922
 923                if (arena->nextoff == 0)
 924                        break;
 925        }
 926        btt->num_arenas = num_arenas;
 927        btt->nlba = cur_nlba;
 928        btt->init_state = INIT_READY;
 929
 930        kfree(super);
 931        return ret;
 932
 933 out:
 934        kfree(arena);
 935        free_arenas(btt);
 936 out_super:
 937        kfree(super);
 938        return ret;
 939}
 940
 941static int create_arenas(struct btt *btt)
 942{
 943        size_t remaining = btt->rawsize;
 944        size_t cur_off = 0;
 945
 946        while (remaining) {
 947                struct arena_info *arena;
 948                size_t arena_size = min_t(u64, ARENA_MAX_SIZE, remaining);
 949
 950                remaining -= arena_size;
 951                if (arena_size < ARENA_MIN_SIZE)
 952                        break;
 953
 954                arena = alloc_arena(btt, arena_size, btt->nlba, cur_off);
 955                if (!arena) {
 956                        free_arenas(btt);
 957                        return -ENOMEM;
 958                }
 959                btt->nlba += arena->external_nlba;
 960                if (remaining >= ARENA_MIN_SIZE)
 961                        arena->nextoff = arena->size;
 962                else
 963                        arena->nextoff = 0;
 964                cur_off += arena_size;
 965                list_add_tail(&arena->list, &btt->arena_list);
 966        }
 967
 968        return 0;
 969}
 970
 971/*
 972 * This function completes arena initialization by writing
 973 * all the metadata.
 974 * It is only called for an uninitialized arena when a write
 975 * to that arena occurs for the first time.
 976 */
 977static int btt_arena_write_layout(struct arena_info *arena)
 978{
 979        int ret;
 980        u64 sum;
 981        struct btt_sb *super;
 982        struct nd_btt *nd_btt = arena->nd_btt;
 983        const u8 *parent_uuid = nd_dev_to_uuid(&nd_btt->ndns->dev);
 984
 985        ret = btt_map_init(arena);
 986        if (ret)
 987                return ret;
 988
 989        ret = btt_log_init(arena);
 990        if (ret)
 991                return ret;
 992
 993        super = kzalloc(sizeof(struct btt_sb), GFP_NOIO);
 994        if (!super)
 995                return -ENOMEM;
 996
 997        strncpy(super->signature, BTT_SIG, BTT_SIG_LEN);
 998        memcpy(super->uuid, nd_btt->uuid, 16);
 999        memcpy(super->parent_uuid, parent_uuid, 16);
1000        super->flags = cpu_to_le32(arena->flags);
1001        super->version_major = cpu_to_le16(arena->version_major);
1002        super->version_minor = cpu_to_le16(arena->version_minor);
1003        super->external_lbasize = cpu_to_le32(arena->external_lbasize);
1004        super->external_nlba = cpu_to_le32(arena->external_nlba);
1005        super->internal_lbasize = cpu_to_le32(arena->internal_lbasize);
1006        super->internal_nlba = cpu_to_le32(arena->internal_nlba);
1007        super->nfree = cpu_to_le32(arena->nfree);
1008        super->infosize = cpu_to_le32(sizeof(struct btt_sb));
1009        super->nextoff = cpu_to_le64(arena->nextoff);
1010        /*
1011         * Subtract arena->infooff (arena start) so numbers are relative
1012         * to 'this' arena
1013         */
1014        super->dataoff = cpu_to_le64(arena->dataoff - arena->infooff);
1015        super->mapoff = cpu_to_le64(arena->mapoff - arena->infooff);
1016        super->logoff = cpu_to_le64(arena->logoff - arena->infooff);
1017        super->info2off = cpu_to_le64(arena->info2off - arena->infooff);
1018
1019        super->flags = 0;
1020        sum = nd_sb_checksum((struct nd_gen_sb *) super);
1021        super->checksum = cpu_to_le64(sum);
1022
1023        ret = btt_info_write(arena, super);
1024
1025        kfree(super);
1026        return ret;
1027}
1028
1029/*
1030 * This function completes the initialization for the BTT namespace
1031 * such that it is ready to accept IOs
1032 */
1033static int btt_meta_init(struct btt *btt)
1034{
1035        int ret = 0;
1036        struct arena_info *arena;
1037
1038        mutex_lock(&btt->init_lock);
1039        list_for_each_entry(arena, &btt->arena_list, list) {
1040                ret = btt_arena_write_layout(arena);
1041                if (ret)
1042                        goto unlock;
1043
1044                ret = btt_freelist_init(arena);
1045                if (ret)
1046                        goto unlock;
1047
1048                ret = btt_rtt_init(arena);
1049                if (ret)
1050                        goto unlock;
1051
1052                ret = btt_maplocks_init(arena);
1053                if (ret)
1054                        goto unlock;
1055        }
1056
1057        btt->init_state = INIT_READY;
1058
1059 unlock:
1060        mutex_unlock(&btt->init_lock);
1061        return ret;
1062}
1063
1064static u32 btt_meta_size(struct btt *btt)
1065{
1066        return btt->lbasize - btt->sector_size;
1067}
1068
1069/*
1070 * This function calculates the arena in which the given LBA lies
1071 * by doing a linear walk. This is acceptable since we expect only
1072 * a few arenas. If we have backing devices that get much larger,
1073 * we can construct a balanced binary tree of arenas at init time
1074 * so that this range search becomes faster.
1075 */
1076static int lba_to_arena(struct btt *btt, sector_t sector, __u32 *premap,
1077                                struct arena_info **arena)
1078{
1079        struct arena_info *arena_list;
1080        __u64 lba = div_u64(sector << SECTOR_SHIFT, btt->sector_size);
1081
1082        list_for_each_entry(arena_list, &btt->arena_list, list) {
1083                if (lba < arena_list->external_nlba) {
1084                        *arena = arena_list;
1085                        *premap = lba;
1086                        return 0;
1087                }
1088                lba -= arena_list->external_nlba;
1089        }
1090
1091        return -EIO;
1092}
1093
1094/*
1095 * The following (lock_map, unlock_map) are mostly just to improve
1096 * readability, since they index into an array of locks
1097 */
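/*
 * The lock index is the byte offset of the premap entry in the map, in
 * units of cache lines, modulo 'nfree': a given premap LBA always hashes
 * to the same lock, and map entries sharing a cache line share a lock.
 */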
1098static void lock_map(struct arena_info *arena, u32 premap)
1099                __acquires(&arena->map_locks[idx].lock)
1100{
1101        u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree;
1102
1103        spin_lock(&arena->map_locks[idx].lock);
1104}
1105
1106static void unlock_map(struct arena_info *arena, u32 premap)
1107                __releases(&arena->map_locks[idx].lock)
1108{
1109        u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree;
1110
1111        spin_unlock(&arena->map_locks[idx].lock);
1112}
1113
1114static int btt_data_read(struct arena_info *arena, struct page *page,
1115                        unsigned int off, u32 lba, u32 len)
1116{
1117        int ret;
1118        u64 nsoff = to_namespace_offset(arena, lba);
1119        void *mem = kmap_atomic(page);
1120
1121        ret = arena_read_bytes(arena, nsoff, mem + off, len, NVDIMM_IO_ATOMIC);
1122        kunmap_atomic(mem);
1123
1124        return ret;
1125}
1126
1127static int btt_data_write(struct arena_info *arena, u32 lba,
1128                        struct page *page, unsigned int off, u32 len)
1129{
1130        int ret;
1131        u64 nsoff = to_namespace_offset(arena, lba);
1132        void *mem = kmap_atomic(page);
1133
1134        ret = arena_write_bytes(arena, nsoff, mem + off, len, NVDIMM_IO_ATOMIC);
1135        kunmap_atomic(mem);
1136
1137        return ret;
1138}
1139
1140static void zero_fill_data(struct page *page, unsigned int off, u32 len)
1141{
1142        void *mem = kmap_atomic(page);
1143
1144        memset(mem + off, 0, len);
1145        kunmap_atomic(mem);
1146}
1147
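/*
 * When the BTT block size is larger than the advertised sector size, the
 * difference (btt_meta_size()) holds per-sector integrity metadata, stored
 * on media directly after the sector's data within its internal block.
 * btt_rw_integrity() copies that region to or from the bio integrity
 * payload, one bvec at a time.
 */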
1148#ifdef CONFIG_BLK_DEV_INTEGRITY
1149static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip,
1150                        struct arena_info *arena, u32 postmap, int rw)
1151{
1152        unsigned int len = btt_meta_size(btt);
1153        u64 meta_nsoff;
1154        int ret = 0;
1155
1156        if (bip == NULL)
1157                return 0;
1158
1159        meta_nsoff = to_namespace_offset(arena, postmap) + btt->sector_size;
1160
1161        while (len) {
1162                unsigned int cur_len;
1163                struct bio_vec bv;
1164                void *mem;
1165
1166                bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
1167                /*
1168                 * The 'bv' obtained from bvec_iter_bvec has its .bv_len and
1169                 * .bv_offset already adjusted for iter->bi_bvec_done, and we
1170                 * can use those directly
1171                 */
1172
1173                cur_len = min(len, bv.bv_len);
1174                mem = kmap_atomic(bv.bv_page);
1175                if (rw)
1176                        ret = arena_write_bytes(arena, meta_nsoff,
1177                                        mem + bv.bv_offset, cur_len,
1178                                        NVDIMM_IO_ATOMIC);
1179                else
1180                        ret = arena_read_bytes(arena, meta_nsoff,
1181                                        mem + bv.bv_offset, cur_len,
1182                                        NVDIMM_IO_ATOMIC);
1183
1184                kunmap_atomic(mem);
1185                if (ret)
1186                        return ret;
1187
1188                len -= cur_len;
1189                meta_nsoff += cur_len;
1190                if (!bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len))
1191                        return -EIO;
1192        }
1193
1194        return ret;
1195}
1196
1197#else /* CONFIG_BLK_DEV_INTEGRITY */
1198static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip,
1199                        struct arena_info *arena, u32 postmap, int rw)
1200{
1201        return 0;
1202}
1203#endif
1204
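/*
 * Read path. Before touching the data, the reader publishes the postmap
 * block it is about to read in its lane's RTT slot, then re-reads the map
 * entry to make sure the mapping did not change underneath it. Writers
 * poll the RTT before reusing a free block (see btt_write_pg()), so a
 * block is never overwritten while a read of it is still in flight.
 */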
1205static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip,
1206                        struct page *page, unsigned int off, sector_t sector,
1207                        unsigned int len)
1208{
1209        int ret = 0;
1210        int t_flag, e_flag;
1211        struct arena_info *arena = NULL;
1212        u32 lane = 0, premap, postmap;
1213
1214        while (len) {
1215                u32 cur_len;
1216
1217                lane = nd_region_acquire_lane(btt->nd_region);
1218
1219                ret = lba_to_arena(btt, sector, &premap, &arena);
1220                if (ret)
1221                        goto out_lane;
1222
1223                cur_len = min(btt->sector_size, len);
1224
1225                ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag,
1226                                NVDIMM_IO_ATOMIC);
1227                if (ret)
1228                        goto out_lane;
1229
1230                /*
1231                 * We loop to make sure that the post map LBA didn't change
1232                 * from under us between writing the RTT and doing the actual
1233                 * read.
1234                 */
1235                while (1) {
1236                        u32 new_map;
1237                        int new_t, new_e;
1238
1239                        if (t_flag) {
1240                                zero_fill_data(page, off, cur_len);
1241                                goto out_lane;
1242                        }
1243
1244                        if (e_flag) {
1245                                ret = -EIO;
1246                                goto out_lane;
1247                        }
1248
1249                        arena->rtt[lane] = RTT_VALID | postmap;
1250                        /*
1251                         * Barrier to make sure this write is not reordered
1252                         * to do the verification map_read before the RTT store
1253                         */
1254                        barrier();
1255
1256                        ret = btt_map_read(arena, premap, &new_map, &new_t,
1257                                                &new_e, NVDIMM_IO_ATOMIC);
1258                        if (ret)
1259                                goto out_rtt;
1260
1261                        if ((postmap == new_map) && (t_flag == new_t) &&
1262                                        (e_flag == new_e))
1263                                break;
1264
1265                        postmap = new_map;
1266                        t_flag = new_t;
1267                        e_flag = new_e;
1268                }
1269
1270                ret = btt_data_read(arena, page, off, postmap, cur_len);
1271                if (ret) {
1272                        int rc;
1273
1274                        /* Media error - set the e_flag */
1275                        rc = btt_map_write(arena, premap, postmap, 0, 1,
1276                                NVDIMM_IO_ATOMIC);
1277                        goto out_rtt;
1278                }
1279
1280                if (bip) {
1281                        ret = btt_rw_integrity(btt, bip, arena, postmap, READ);
1282                        if (ret)
1283                                goto out_rtt;
1284                }
1285
1286                arena->rtt[lane] = RTT_INVALID;
1287                nd_region_release_lane(btt->nd_region, lane);
1288
1289                len -= cur_len;
1290                off += cur_len;
1291                sector += btt->sector_size >> SECTOR_SHIFT;
1292        }
1293
1294        return 0;
1295
1296 out_rtt:
1297        arena->rtt[lane] = RTT_INVALID;
1298 out_lane:
1299        nd_region_release_lane(btt->nd_region, lane);
1300        return ret;
1301}
1302
1303/*
1304 * Normally, arena_{read,write}_bytes will take care of the initial offset
1305 * adjustment, but in the case of btt_is_badblock, where we query is_bad_pmem,
1306 * we need the final, raw namespace offset here
1307 */
1308static bool btt_is_badblock(struct btt *btt, struct arena_info *arena,
1309                u32 postmap)
1310{
1311        u64 nsoff = adjust_initial_offset(arena->nd_btt,
1312                        to_namespace_offset(arena, postmap));
1313        sector_t phys_sector = nsoff >> 9;
1314
1315        return is_bad_pmem(btt->phys_bb, phys_sector, arena->internal_lbasize);
1316}
1317
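/*
 * Write path. Each sector-sized chunk is written to the lane's current
 * free block first; only then, under the map lock, is the old map entry
 * read, the flog entry committed, and the map pointed at the new block.
 * The previously mapped block becomes the lane's next free block
 * (see btt_flog_write()).
 */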
1318static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
1319                        sector_t sector, struct page *page, unsigned int off,
1320                        unsigned int len)
1321{
1322        int ret = 0;
1323        struct arena_info *arena = NULL;
1324        u32 premap = 0, old_postmap, new_postmap, lane = 0, i;
1325        struct log_entry log;
1326        int sub;
1327
1328        while (len) {
1329                u32 cur_len;
1330                int e_flag;
1331
1332 retry:
1333                lane = nd_region_acquire_lane(btt->nd_region);
1334
1335                ret = lba_to_arena(btt, sector, &premap, &arena);
1336                if (ret)
1337                        goto out_lane;
1338                cur_len = min(btt->sector_size, len);
1339
1340                if ((arena->flags & IB_FLAG_ERROR_MASK) != 0) {
1341                        ret = -EIO;
1342                        goto out_lane;
1343                }
1344
1345                if (btt_is_badblock(btt, arena, arena->freelist[lane].block))
1346                        arena->freelist[lane].has_err = 1;
1347
1348                if (mutex_is_locked(&arena->err_lock)
1349                                || arena->freelist[lane].has_err) {
1350                        nd_region_release_lane(btt->nd_region, lane);
1351
1352                        ret = arena_clear_freelist_error(arena, lane);
1353                        if (ret)
1354                                return ret;
1355
1356                        /* OK to acquire a different lane/free block */
1357                        goto retry;
1358                }
1359
1360                new_postmap = arena->freelist[lane].block;
1361
1362                /* Wait if the new block is being read from */
1363                for (i = 0; i < arena->nfree; i++)
1364                        while (arena->rtt[i] == (RTT_VALID | new_postmap))
1365                                cpu_relax();
1366
1367
1368                if (new_postmap >= arena->internal_nlba) {
1369                        ret = -EIO;
1370                        goto out_lane;
1371                }
1372
1373                ret = btt_data_write(arena, new_postmap, page, off, cur_len);
1374                if (ret)
1375                        goto out_lane;
1376
1377                if (bip) {
1378                        ret = btt_rw_integrity(btt, bip, arena, new_postmap,
1379                                                WRITE);
1380                        if (ret)
1381                                goto out_lane;
1382                }
1383
1384                lock_map(arena, premap);
1385                ret = btt_map_read(arena, premap, &old_postmap, NULL, &e_flag,
1386                                NVDIMM_IO_ATOMIC);
1387                if (ret)
1388                        goto out_map;
1389                if (old_postmap >= arena->internal_nlba) {
1390                        ret = -EIO;
1391                        goto out_map;
1392                }
1393                if (e_flag)
1394                        set_e_flag(old_postmap);
1395
1396                log.lba = cpu_to_le32(premap);
1397                log.old_map = cpu_to_le32(old_postmap);
1398                log.new_map = cpu_to_le32(new_postmap);
1399                log.seq = cpu_to_le32(arena->freelist[lane].seq);
1400                sub = arena->freelist[lane].sub;
1401                ret = btt_flog_write(arena, lane, sub, &log);
1402                if (ret)
1403                        goto out_map;
1404
1405                ret = btt_map_write(arena, premap, new_postmap, 0, 0,
1406                        NVDIMM_IO_ATOMIC);
1407                if (ret)
1408                        goto out_map;
1409
1410                unlock_map(arena, premap);
1411                nd_region_release_lane(btt->nd_region, lane);
1412
1413                if (e_flag) {
1414                        ret = arena_clear_freelist_error(arena, lane);
1415                        if (ret)
1416                                return ret;
1417                }
1418
1419                len -= cur_len;
1420                off += cur_len;
1421                sector += btt->sector_size >> SECTOR_SHIFT;
1422        }
1423
1424        return 0;
1425
1426 out_map:
1427        unlock_map(arena, premap);
1428 out_lane:
1429        nd_region_release_lane(btt->nd_region, lane);
1430        return ret;
1431}
1432
1433static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
1434                        struct page *page, unsigned int len, unsigned int off,
1435                        unsigned int op, sector_t sector)
1436{
1437        int ret;
1438
1439        if (!op_is_write(op)) {
1440                ret = btt_read_pg(btt, bip, page, off, sector, len);
1441                flush_dcache_page(page);
1442        } else {
1443                flush_dcache_page(page);
1444                ret = btt_write_pg(btt, bip, sector, page, off, len);
1445        }
1446
1447        return ret;
1448}
1449
1450static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
1451{
1452        struct bio_integrity_payload *bip = bio_integrity(bio);
1453        struct btt *btt = q->queuedata;
1454        struct bvec_iter iter;
1455        unsigned long start;
1456        struct bio_vec bvec;
1457        int err = 0;
1458        bool do_acct;
1459
1460        if (!bio_integrity_prep(bio))
1461                return BLK_QC_T_NONE;
1462
1463        do_acct = nd_iostat_start(bio, &start);
1464        bio_for_each_segment(bvec, bio, iter) {
1465                unsigned int len = bvec.bv_len;
1466
1467                if (len > PAGE_SIZE || len < btt->sector_size ||
1468                                len % btt->sector_size) {
1469                        dev_err_ratelimited(&btt->nd_btt->dev,
1470                                "unaligned bio segment (len: %d)\n", len);
1471                        bio->bi_status = BLK_STS_IOERR;
1472                        break;
1473                }
1474
1475                err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset,
1476                                  bio_op(bio), iter.bi_sector);
1477                if (err) {
1478                        dev_err(&btt->nd_btt->dev,
1479                                "io error in %s sector %lld, len %d\n",
1480                                        (op_is_write(bio_op(bio))) ? "WRITE" :
1481                                        "READ",
1482                                        (unsigned long long) iter.bi_sector, len);
1483                        bio->bi_status = errno_to_blk_status(err);
1484                        break;
1485                }
1486        }
1487        if (do_acct)
1488                nd_iostat_end(bio, start);
1489
1490        bio_endio(bio);
1491        return BLK_QC_T_NONE;
1492}
1493
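/*
 * ->rw_page() entry point for synchronous single-page I/O (e.g. the swap
 * path).  On success the page is completed immediately via page_endio();
 * on failure the error is simply returned to the caller.
 */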
1494static int btt_rw_page(struct block_device *bdev, sector_t sector,
1495                struct page *page, unsigned int op)
1496{
1497        struct btt *btt = bdev->bd_disk->private_data;
1498        int rc;
1499        unsigned int len;
1500
1501        len = hpage_nr_pages(page) * PAGE_SIZE;
1502        rc = btt_do_bvec(btt, NULL, page, len, 0, op, sector);
1503        if (rc == 0)
1504                page_endio(page, op_is_write(op), 0);
1505
1506        return rc;
1507}
1508
1510static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo)
1511{
1512        /* some standard values */
1513        geo->heads = 1 << 6;
1514        geo->sectors = 1 << 5;
1515        geo->cylinders = get_capacity(bd->bd_disk) >> 11;
1516        return 0;
1517}
1518
1519static const struct block_device_operations btt_fops = {
1520        .owner =                THIS_MODULE,
1521        .rw_page =              btt_rw_page,
1522        .getgeo =               btt_getgeo,
1523        .revalidate_disk =      nvdimm_revalidate_disk,
1524};
1525
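/*
 * Allocate and register the block layer frontend for this BTT: a bio-based
 * request queue driven by btt_make_request() and a gendisk whose logical
 * block size is the BTT sector size and whose capacity is nlba BTT sectors.
 * When btt_meta_size() reports per-sector metadata, blk integrity support
 * is set up before the disk is added.
 */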
1526static int btt_blk_init(struct btt *btt)
1527{
1528        struct nd_btt *nd_btt = btt->nd_btt;
1529        struct nd_namespace_common *ndns = nd_btt->ndns;
1530
1531        /* create a new disk and request queue for btt */
1532        btt->btt_queue = blk_alloc_queue(GFP_KERNEL);
1533        if (!btt->btt_queue)
1534                return -ENOMEM;
1535
1536        btt->btt_disk = alloc_disk(0);
1537        if (!btt->btt_disk) {
1538                blk_cleanup_queue(btt->btt_queue);
1539                return -ENOMEM;
1540        }
1541
1542        nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name);
1543        btt->btt_disk->first_minor = 0;
1544        btt->btt_disk->fops = &btt_fops;
1545        btt->btt_disk->private_data = btt;
1546        btt->btt_disk->queue = btt->btt_queue;
1547        btt->btt_disk->flags = GENHD_FL_EXT_DEVT;
1548        btt->btt_disk->queue->backing_dev_info->capabilities |=
1549                        BDI_CAP_SYNCHRONOUS_IO;
1550
1551        blk_queue_make_request(btt->btt_queue, btt_make_request);
1552        blk_queue_logical_block_size(btt->btt_queue, btt->sector_size);
1553        blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX);
1554        blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_queue);
1555        btt->btt_queue->queuedata = btt;
1556
1557        if (btt_meta_size(btt)) {
1558                int rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt));
1559
1560                if (rc) {
1561                        del_gendisk(btt->btt_disk);
1562                        put_disk(btt->btt_disk);
1563                        blk_cleanup_queue(btt->btt_queue);
1564                        return rc;
1565                }
1566        }
1567        set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9);
1568        device_add_disk(&btt->nd_btt->dev, btt->btt_disk, NULL);
1569        btt->nd_btt->size = btt->nlba * (u64)btt->sector_size;
1570        revalidate_disk(btt->btt_disk);
1571
1572        return 0;
1573}
1574
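/* Undo btt_blk_init(): unregister and release the disk and its queue */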
1575static void btt_blk_cleanup(struct btt *btt)
1576{
1577        del_gendisk(btt->btt_disk);
1578        put_disk(btt->btt_disk);
1579        blk_cleanup_queue(btt->btt_queue);
1580}
1581
1582/**
1583 * btt_init - initialize a block translation table for the given device
1584 * @nd_btt:     device with BTT geometry and backing device info
1585 * @rawsize:    raw size in bytes of the backing device
1586 * @lbasize:    lba size of the backing device
1587 * @uuid:       A uuid for the backing device - this is stored on media
1588 * @nd_region:  parent region of the backing namespace, used for lane allocation
1589 *
1590 * Initialize a Block Translation Table on a backing device to provide
1591 * single sector power fail atomicity.
1592 *
1593 * Context:
1594 * Might sleep.
1595 *
1596 * Returns:
1597 * Pointer to a new struct btt on success, NULL on failure.
1598 */
1599static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
1600                u32 lbasize, u8 *uuid, struct nd_region *nd_region)
1601{
1602        int ret;
1603        struct btt *btt;
1604        struct nd_namespace_io *nsio;
1605        struct device *dev = &nd_btt->dev;
1606
1607        btt = devm_kzalloc(dev, sizeof(struct btt), GFP_KERNEL);
1608        if (!btt)
1609                return NULL;
1610
1611        btt->nd_btt = nd_btt;
1612        btt->rawsize = rawsize;
1613        btt->lbasize = lbasize;
1614        btt->sector_size = ((lbasize >= 4096) ? 4096 : 512);
1615        INIT_LIST_HEAD(&btt->arena_list);
1616        mutex_init(&btt->init_lock);
1617        btt->nd_region = nd_region;
1618        nsio = to_nd_namespace_io(&nd_btt->ndns->dev);
1619        btt->phys_bb = &nsio->bb;
1620
1621        ret = discover_arenas(btt);
1622        if (ret) {
1623                dev_err(dev, "init: error in discover_arenas: %d\n", ret);
1624                return NULL;
1625        }
1626
1627        if (btt->init_state != INIT_READY && nd_region->ro) {
1628                dev_warn(dev, "%s is read-only, unable to init btt metadata\n",
1629                                dev_name(&nd_region->dev));
1630                return NULL;
1631        } else if (btt->init_state != INIT_READY) {
1632                btt->num_arenas = (rawsize / ARENA_MAX_SIZE) +
1633                        ((rawsize % ARENA_MAX_SIZE) ? 1 : 0);
1634                dev_dbg(dev, "init: %d arenas for %llu rawsize\n",
1635                                btt->num_arenas, rawsize);
1636
1637                ret = create_arenas(btt);
1638                if (ret) {
1639                        dev_info(dev, "init: create_arenas: %d\n", ret);
1640                        return NULL;
1641                }
1642
1643                ret = btt_meta_init(btt);
1644                if (ret) {
1645                        dev_err(dev, "init: error in meta_init: %d\n", ret);
1646                        return NULL;
1647                }
1648        }
1649
1650        ret = btt_blk_init(btt);
1651        if (ret) {
1652                dev_err(dev, "init: error in blk_init: %d\n", ret);
1653                return NULL;
1654        }
1655
1656        btt_debugfs_init(btt);
1657
1658        return btt;
1659}
1660
1661/**
1662 * btt_fini - de-initialize a BTT
1663 * @btt:        the BTT handle that was generated by btt_init
1664 *
1665 * De-initialize a Block Translation Table on device removal
1666 *
1667 * Context:
1668 * Might sleep.
1669 */
1670static void btt_fini(struct btt *btt)
1671{
1672        if (btt) {
1673                btt_blk_cleanup(btt);
1674                free_arenas(btt);
1675                debugfs_remove_recursive(btt->debugfs_dir);
1676        }
1677}
1678
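/*
 * Called when a BTT device is bound on top of a namespace.  Verify that the
 * BTT is fully configured (uuid, namespace, lbasize), pick up any version
 * dependent fields from an existing info block, check that the usable
 * capacity can hold at least one arena, and then build the in-core btt via
 * btt_init().
 */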
1679int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
1680{
1681        struct nd_btt *nd_btt = to_nd_btt(ndns->claim);
1682        struct nd_region *nd_region;
1683        struct btt_sb *btt_sb;
1684        struct btt *btt;
1685        size_t rawsize;
1686
1687        if (!nd_btt->uuid || !nd_btt->ndns || !nd_btt->lbasize) {
1688                dev_dbg(&nd_btt->dev, "incomplete btt configuration\n");
1689                return -ENODEV;
1690        }
1691
1692        btt_sb = devm_kzalloc(&nd_btt->dev, sizeof(*btt_sb), GFP_KERNEL);
1693        if (!btt_sb)
1694                return -ENOMEM;
1695
1696        /*
1697         * If this returns < 0, that is ok as it just means there wasn't
1698         * an existing BTT, and we're creating a new one. We still need to
1699         * call this as we need the version dependent fields in nd_btt to be
1700         * set correctly based on the holder class
1701         */
1702        nd_btt_version(nd_btt, ndns, btt_sb);
1703
1704        rawsize = nvdimm_namespace_capacity(ndns) - nd_btt->initial_offset;
1705        if (rawsize < ARENA_MIN_SIZE) {
1706                dev_dbg(&nd_btt->dev, "%s must be at least %ld bytes\n",
1707                                dev_name(&ndns->dev),
1708                                ARENA_MIN_SIZE + nd_btt->initial_offset);
1709                return -ENXIO;
1710        }
1711        nd_region = to_nd_region(nd_btt->dev.parent);
1712        btt = btt_init(nd_btt, rawsize, nd_btt->lbasize, nd_btt->uuid,
1713                        nd_region);
1714        if (!btt)
1715                return -ENOMEM;
1716        nd_btt->btt = btt;
1717
1718        return 0;
1719}
1720EXPORT_SYMBOL(nvdimm_namespace_attach_btt);
1721
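/* Inverse of nvdimm_namespace_attach_btt(): tear down the in-core btt */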
1722int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt)
1723{
1724        struct btt *btt = nd_btt->btt;
1725
1726        btt_fini(btt);
1727        nd_btt->btt = NULL;
1728
1729        return 0;
1730}
1731EXPORT_SYMBOL(nvdimm_namespace_detach_btt);
1732
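/* Module init: create the shared top-level "btt" debugfs directory */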
1733static int __init nd_btt_init(void)
1734{
1735        int rc = 0;
1736
1737        debugfs_root = debugfs_create_dir("btt", NULL);
1738        if (IS_ERR_OR_NULL(debugfs_root))
1739                rc = -ENXIO;
1740
1741        return rc;
1742}
1743
1744static void __exit nd_btt_exit(void)
1745{
1746        debugfs_remove_recursive(debugfs_root);
1747}
1748
1749MODULE_ALIAS_ND_DEVICE(ND_DEVICE_BTT);
1750MODULE_AUTHOR("Vishal Verma <vishal.l.verma@linux.intel.com>");
1751MODULE_LICENSE("GPL v2");
1752module_init(nd_btt_init);
1753module_exit(nd_btt_exit);
1754