linux/drivers/nvdimm/btt.c
   1/*
   2 * Block Translation Table
   3 * Copyright (c) 2014-2015, Intel Corporation.
   4 *
   5 * This program is free software; you can redistribute it and/or modify it
   6 * under the terms and conditions of the GNU General Public License,
   7 * version 2, as published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope it will be useful, but WITHOUT
  10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  12 * more details.
  13 */
  14#include <linux/highmem.h>
  15#include <linux/debugfs.h>
  16#include <linux/blkdev.h>
  17#include <linux/module.h>
  18#include <linux/device.h>
  19#include <linux/mutex.h>
  20#include <linux/hdreg.h>
  21#include <linux/genhd.h>
  22#include <linux/sizes.h>
  23#include <linux/ndctl.h>
  24#include <linux/fs.h>
  25#include <linux/nd.h>
  26#include <linux/backing-dev.h>
  27#include "btt.h"
  28#include "nd.h"
  29
  30enum log_ent_request {
  31        LOG_NEW_ENT = 0,
  32        LOG_OLD_ENT
  33};
  34
  35static struct device *to_dev(struct arena_info *arena)
  36{
  37        return &arena->nd_btt->dev;
  38}
  39
  40static u64 adjust_initial_offset(struct nd_btt *nd_btt, u64 offset)
  41{
  42        return offset + nd_btt->initial_offset;
  43}
  44
  45static int arena_read_bytes(struct arena_info *arena, resource_size_t offset,
  46                void *buf, size_t n, unsigned long flags)
  47{
  48        struct nd_btt *nd_btt = arena->nd_btt;
  49        struct nd_namespace_common *ndns = nd_btt->ndns;
  50
  51        /* arena offsets may be shifted from the base of the device */
  52        offset = adjust_initial_offset(nd_btt, offset);
  53        return nvdimm_read_bytes(ndns, offset, buf, n, flags);
  54}
  55
  56static int arena_write_bytes(struct arena_info *arena, resource_size_t offset,
  57                void *buf, size_t n, unsigned long flags)
  58{
  59        struct nd_btt *nd_btt = arena->nd_btt;
  60        struct nd_namespace_common *ndns = nd_btt->ndns;
  61
  62        /* arena offsets may be shifted from the base of the device */
  63        offset = adjust_initial_offset(nd_btt, offset);
  64        return nvdimm_write_bytes(ndns, offset, buf, n, flags);
  65}
  66
  67static int btt_info_write(struct arena_info *arena, struct btt_sb *super)
  68{
  69        int ret;
  70
  71        /*
  72         * infooff and info2off should always be at least 512B aligned.
  73         * We rely on that to make sure rw_bytes does error clearing
  74         * correctly, so make sure that is the case.
  75         */
  76        dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->infooff, 512),
  77                "arena->infooff: %#llx is unaligned\n", arena->infooff);
  78        dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->info2off, 512),
  79                "arena->info2off: %#llx is unaligned\n", arena->info2off);
  80
  81        ret = arena_write_bytes(arena, arena->info2off, super,
  82                        sizeof(struct btt_sb), 0);
  83        if (ret)
  84                return ret;
  85
  86        return arena_write_bytes(arena, arena->infooff, super,
  87                        sizeof(struct btt_sb), 0);
  88}
  89
  90static int btt_info_read(struct arena_info *arena, struct btt_sb *super)
  91{
  92        return arena_read_bytes(arena, arena->infooff, super,
  93                        sizeof(struct btt_sb), 0);
  94}
  95
  96/*
  97 * 'raw' version of btt_map write
  98 * Assumptions:
  99 *   mapping is in little-endian
 100 *   mapping contains 'E' and 'Z' flags as desired
 101 */
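/*
 * On-media encoding (see the MAP_* helpers in btt.h): the top two bits of
 * each 4-byte entry carry the Z (zero/trim) and E (error) flags, and the
 * remaining bits hold the postmap block. Both flag bits set marks a normal,
 * written entry (MAP_ENT_NORMAL); only the E bit or only the Z bit set marks
 * an error or a trimmed block respectively; both bits clear is the initial
 * state, read back as an identity (premap == postmap) mapping.
 */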
 102static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping,
 103                unsigned long flags)
 104{
 105        u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);
 106
 107        if (unlikely(lba >= arena->external_nlba))
 108                dev_err_ratelimited(to_dev(arena),
 109                        "%s: lba %#x out of range (max: %#x)\n",
 110                        __func__, lba, arena->external_nlba);
 111        return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE, flags);
 112}
 113
 114static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping,
 115                        u32 z_flag, u32 e_flag, unsigned long rwb_flags)
 116{
 117        u32 ze;
 118        __le32 mapping_le;
 119
 120        /*
 121         * This 'mapping' is supposed to be just the LBA mapping, without
 122         * any flags set, so strip the flag bits.
 123         */
 124        mapping = ent_lba(mapping);
 125
 126        ze = (z_flag << 1) + e_flag;
 127        switch (ze) {
 128        case 0:
 129                /*
 130                 * We want to set neither of the Z or E flags, and
 131                 * in the actual layout, this means setting the bit
 132                 * positions of both to '1' to indicate a 'normal'
 133                 * map entry
 134                 */
 135                mapping |= MAP_ENT_NORMAL;
 136                break;
 137        case 1:
 138                mapping |= (1 << MAP_ERR_SHIFT);
 139                break;
 140        case 2:
 141                mapping |= (1 << MAP_TRIM_SHIFT);
 142                break;
 143        default:
 144                /*
 145                 * The case where Z and E are both sent in as '1' could be
 146                 * construed as a valid 'normal' case, but we decide not to,
 147                 * to avoid confusion
 148                 */
 149                dev_err_ratelimited(to_dev(arena),
 150                        "Invalid use of Z and E flags\n");
 151                return -EIO;
 152        }
 153
 154        mapping_le = cpu_to_le32(mapping);
 155        return __btt_map_write(arena, lba, mapping_le, rwb_flags);
 156}
 157
 158static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
 159                        int *trim, int *error, unsigned long rwb_flags)
 160{
 161        int ret;
 162        __le32 in;
 163        u32 raw_mapping, postmap, ze, z_flag, e_flag;
 164        u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);
 165
 166        if (unlikely(lba >= arena->external_nlba))
 167                dev_err_ratelimited(to_dev(arena),
 168                        "%s: lba %#x out of range (max: %#x)\n",
 169                        __func__, lba, arena->external_nlba);
 170
 171        ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE, rwb_flags);
 172        if (ret)
 173                return ret;
 174
 175        raw_mapping = le32_to_cpu(in);
 176
 177        z_flag = ent_z_flag(raw_mapping);
 178        e_flag = ent_e_flag(raw_mapping);
 179        ze = (z_flag << 1) + e_flag;
 180        postmap = ent_lba(raw_mapping);
 181
 182        /* Reuse the {z,e}_flag variables for *trim and *error */
 183        z_flag = 0;
 184        e_flag = 0;
 185
 186        switch (ze) {
 187        case 0:
 188                /* Initial state. Return postmap = premap */
 189                *mapping = lba;
 190                break;
 191        case 1:
 192                *mapping = postmap;
 193                e_flag = 1;
 194                break;
 195        case 2:
 196                *mapping = postmap;
 197                z_flag = 1;
 198                break;
 199        case 3:
 200                *mapping = postmap;
 201                break;
 202        default:
 203                return -EIO;
 204        }
 205
 206        if (trim)
 207                *trim = z_flag;
 208        if (error)
 209                *error = e_flag;
 210
 211        return ret;
 212}
 213
 214static int btt_log_group_read(struct arena_info *arena, u32 lane,
 215                        struct log_group *log)
 216{
 217        return arena_read_bytes(arena,
 218                        arena->logoff + (lane * LOG_GRP_SIZE), log,
 219                        LOG_GRP_SIZE, 0);
 220}
 221
 222static struct dentry *debugfs_root;
 223
 224static void arena_debugfs_init(struct arena_info *a, struct dentry *parent,
 225                                int idx)
 226{
 227        char dirname[32];
 228        struct dentry *d;
 229
 230        /* If for some reason, parent bttN was not created, exit */
 231        if (!parent)
 232                return;
 233
 234        snprintf(dirname, 32, "arena%d", idx);
 235        d = debugfs_create_dir(dirname, parent);
 236        if (IS_ERR_OR_NULL(d))
 237                return;
 238        a->debugfs_dir = d;
 239
 240        debugfs_create_x64("size", S_IRUGO, d, &a->size);
 241        debugfs_create_x64("external_lba_start", S_IRUGO, d,
 242                                &a->external_lba_start);
 243        debugfs_create_x32("internal_nlba", S_IRUGO, d, &a->internal_nlba);
 244        debugfs_create_u32("internal_lbasize", S_IRUGO, d,
 245                                &a->internal_lbasize);
 246        debugfs_create_x32("external_nlba", S_IRUGO, d, &a->external_nlba);
 247        debugfs_create_u32("external_lbasize", S_IRUGO, d,
 248                                &a->external_lbasize);
 249        debugfs_create_u32("nfree", S_IRUGO, d, &a->nfree);
 250        debugfs_create_u16("version_major", S_IRUGO, d, &a->version_major);
 251        debugfs_create_u16("version_minor", S_IRUGO, d, &a->version_minor);
 252        debugfs_create_x64("nextoff", S_IRUGO, d, &a->nextoff);
 253        debugfs_create_x64("infooff", S_IRUGO, d, &a->infooff);
 254        debugfs_create_x64("dataoff", S_IRUGO, d, &a->dataoff);
 255        debugfs_create_x64("mapoff", S_IRUGO, d, &a->mapoff);
 256        debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff);
 257        debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off);
 258        debugfs_create_x32("flags", S_IRUGO, d, &a->flags);
 259        debugfs_create_u32("log_index_0", S_IRUGO, d, &a->log_index[0]);
 260        debugfs_create_u32("log_index_1", S_IRUGO, d, &a->log_index[1]);
 261}
 262
 263static void btt_debugfs_init(struct btt *btt)
 264{
 265        int i = 0;
 266        struct arena_info *arena;
 267
 268        btt->debugfs_dir = debugfs_create_dir(dev_name(&btt->nd_btt->dev),
 269                                                debugfs_root);
 270        if (IS_ERR_OR_NULL(btt->debugfs_dir))
 271                return;
 272
 273        list_for_each_entry(arena, &btt->arena_list, list) {
 274                arena_debugfs_init(arena, btt->debugfs_dir, i);
 275                i++;
 276        }
 277}
 278
 279static u32 log_seq(struct log_group *log, int log_idx)
 280{
 281        return le32_to_cpu(log->ent[log_idx].seq);
 282}
 283
 284/*
 285 * This function accepts two log entries, and uses the
 286 * sequence number to find the 'older' entry.
 287 * It also updates the sequence number in this old entry to
 288 * make it the 'new' one if the mark_flag is set.
 289 * Finally, it returns which of the entries was the older one.
 290 *
 291 * TODO The logic feels a bit kludge-y. make it better..
 292 */
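/*
 * Sequence numbers cycle 1 -> 2 -> 3 -> 1, with 0 reserved for a
 * never-written slot (see btt_flog_write() below). For example, the pair
 * (1, 2) means the slot holding 2 is newer, while (3, 1) means the slot
 * holding 1 is newer. Equal non-zero values, or a pair summing to more
 * than 5, can only result from corruption.
 */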
 293static int btt_log_get_old(struct arena_info *a, struct log_group *log)
 294{
 295        int idx0 = a->log_index[0];
 296        int idx1 = a->log_index[1];
 297        int old;
 298
 299        /*
 300         * the first ever time this is seen, the entry goes into [0]
 301         * the next time, the following logic works out to put this
 302         * (next) entry into [1]
 303         */
 304        if (log_seq(log, idx0) == 0) {
 305                log->ent[idx0].seq = cpu_to_le32(1);
 306                return 0;
 307        }
 308
 309        if (log_seq(log, idx0) == log_seq(log, idx1))
 310                return -EINVAL;
 311        if (log_seq(log, idx0) + log_seq(log, idx1) > 5)
 312                return -EINVAL;
 313
 314        if (log_seq(log, idx0) < log_seq(log, idx1)) {
 315                if ((log_seq(log, idx1) - log_seq(log, idx0)) == 1)
 316                        old = 0;
 317                else
 318                        old = 1;
 319        } else {
 320                if ((log_seq(log, idx0) - log_seq(log, idx1)) == 1)
 321                        old = 1;
 322                else
 323                        old = 0;
 324        }
 325
 326        return old;
 327}
 328
 329/*
 330 * This function copies the desired (old/new) log entry into ent if
 331 * it is not NULL. It returns the sub-slot number (0 or 1)
 332 * where the desired log entry was found. Negative return values
 333 * indicate errors.
 334 */
 335static int btt_log_read(struct arena_info *arena, u32 lane,
 336                        struct log_entry *ent, int old_flag)
 337{
 338        int ret;
 339        int old_ent, ret_ent;
 340        struct log_group log;
 341
 342        ret = btt_log_group_read(arena, lane, &log);
 343        if (ret)
 344                return -EIO;
 345
 346        old_ent = btt_log_get_old(arena, &log);
 347        if (old_ent < 0 || old_ent > 1) {
 348                dev_err(to_dev(arena),
 349                                "log corruption (%d): lane %d seq [%d, %d]\n",
 350                                old_ent, lane, log.ent[arena->log_index[0]].seq,
 351                                log.ent[arena->log_index[1]].seq);
 352                /* TODO set error state? */
 353                return -EIO;
 354        }
 355
 356        ret_ent = (old_flag ? old_ent : (1 - old_ent));
 357
 358        if (ent != NULL)
 359                memcpy(ent, &log.ent[arena->log_index[ret_ent]], LOG_ENT_SIZE);
 360
 361        return ret_ent;
 362}
 363
 364/*
 365 * This function commits a log entry to media
 366 * It does _not_ prepare the freelist entry for the next write
 367 * btt_flog_write is the wrapper for updating the freelist elements
 368 */
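/*
 * A log entry is 16B: { lba, old_map, new_map, seq } (see btt.h). Because
 * 'seq' sits in the second 8B half and is written last, a power failure
 * between the two writes leaves the slot without a newer sequence number,
 * so btt_log_get_old() keeps treating the other slot as the current entry.
 */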
 369static int __btt_log_write(struct arena_info *arena, u32 lane,
 370                        u32 sub, struct log_entry *ent, unsigned long flags)
 371{
 372        int ret;
 373        u32 group_slot = arena->log_index[sub];
 374        unsigned int log_half = LOG_ENT_SIZE / 2;
 375        void *src = ent;
 376        u64 ns_off;
 377
 378        ns_off = arena->logoff + (lane * LOG_GRP_SIZE) +
 379                (group_slot * LOG_ENT_SIZE);
 380        /* split the 16B write into atomic, durable halves */
 381        ret = arena_write_bytes(arena, ns_off, src, log_half, flags);
 382        if (ret)
 383                return ret;
 384
 385        ns_off += log_half;
 386        src += log_half;
 387        return arena_write_bytes(arena, ns_off, src, log_half, flags);
 388}
 389
 390static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub,
 391                        struct log_entry *ent)
 392{
 393        int ret;
 394
 395        ret = __btt_log_write(arena, lane, sub, ent, NVDIMM_IO_ATOMIC);
 396        if (ret)
 397                return ret;
 398
 399        /* prepare the next free entry */
 400        arena->freelist[lane].sub = 1 - arena->freelist[lane].sub;
 401        if (++(arena->freelist[lane].seq) == 4)
 402                arena->freelist[lane].seq = 1;
 403        if (ent_e_flag(ent->old_map))
 404                arena->freelist[lane].has_err = 1;
 405        arena->freelist[lane].block = le32_to_cpu(ent_lba(ent->old_map));
 406
 407        return ret;
 408}
 409
 410/*
 411 * This function initializes the BTT map to the initial state, which is
 412 * all-zeroes, and indicates an identity mapping
 413 */
 414static int btt_map_init(struct arena_info *arena)
 415{
 416        int ret = -EINVAL;
 417        void *zerobuf;
 418        size_t offset = 0;
 419        size_t chunk_size = SZ_2M;
 420        size_t mapsize = arena->logoff - arena->mapoff;
 421
 422        zerobuf = kzalloc(chunk_size, GFP_KERNEL);
 423        if (!zerobuf)
 424                return -ENOMEM;
 425
 426        /*
  427         * mapoff should always be at least 512B aligned. We rely on that to
 428         * make sure rw_bytes does error clearing correctly, so make sure that
 429         * is the case.
 430         */
 431        dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->mapoff, 512),
 432                "arena->mapoff: %#llx is unaligned\n", arena->mapoff);
 433
 434        while (mapsize) {
 435                size_t size = min(mapsize, chunk_size);
 436
 437                dev_WARN_ONCE(to_dev(arena), size < 512,
 438                        "chunk size: %#zx is unaligned\n", size);
 439                ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf,
 440                                size, 0);
 441                if (ret)
 442                        goto free;
 443
 444                offset += size;
 445                mapsize -= size;
 446                cond_resched();
 447        }
 448
 449 free:
 450        kfree(zerobuf);
 451        return ret;
 452}
 453
 454/*
 455 * This function initializes the BTT log with 'fake' entries pointing
 456 * to the initial reserved set of blocks as being free
 457 */
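/*
 * internal_nlba == external_nlba + nfree, so blocks external_nlba through
 * internal_nlba - 1 are never referenced by the initial (identity) map;
 * the entries written below hand exactly those blocks to the per-lane
 * freelist via their 'old_map' field.
 */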
 458static int btt_log_init(struct arena_info *arena)
 459{
 460        size_t logsize = arena->info2off - arena->logoff;
 461        size_t chunk_size = SZ_4K, offset = 0;
 462        struct log_entry ent;
 463        void *zerobuf;
 464        int ret;
 465        u32 i;
 466
 467        zerobuf = kzalloc(chunk_size, GFP_KERNEL);
 468        if (!zerobuf)
 469                return -ENOMEM;
 470        /*
  471         * logoff should always be at least 512B aligned. We rely on that to
 472         * make sure rw_bytes does error clearing correctly, so make sure that
 473         * is the case.
 474         */
 475        dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->logoff, 512),
 476                "arena->logoff: %#llx is unaligned\n", arena->logoff);
 477
 478        while (logsize) {
 479                size_t size = min(logsize, chunk_size);
 480
 481                dev_WARN_ONCE(to_dev(arena), size < 512,
 482                        "chunk size: %#zx is unaligned\n", size);
 483                ret = arena_write_bytes(arena, arena->logoff + offset, zerobuf,
 484                                size, 0);
 485                if (ret)
 486                        goto free;
 487
 488                offset += size;
 489                logsize -= size;
 490                cond_resched();
 491        }
 492
 493        for (i = 0; i < arena->nfree; i++) {
 494                ent.lba = cpu_to_le32(i);
 495                ent.old_map = cpu_to_le32(arena->external_nlba + i);
 496                ent.new_map = cpu_to_le32(arena->external_nlba + i);
 497                ent.seq = cpu_to_le32(LOG_SEQ_INIT);
 498                ret = __btt_log_write(arena, i, 0, &ent, 0);
 499                if (ret)
 500                        goto free;
 501        }
 502
 503 free:
 504        kfree(zerobuf);
 505        return ret;
 506}
 507
 508static u64 to_namespace_offset(struct arena_info *arena, u64 lba)
 509{
 510        return arena->dataoff + ((u64)lba * arena->internal_lbasize);
 511}
 512
 513static int arena_clear_freelist_error(struct arena_info *arena, u32 lane)
 514{
 515        int ret = 0;
 516
 517        if (arena->freelist[lane].has_err) {
 518                void *zero_page = page_address(ZERO_PAGE(0));
 519                u32 lba = arena->freelist[lane].block;
 520                u64 nsoff = to_namespace_offset(arena, lba);
 521                unsigned long len = arena->sector_size;
 522
 523                mutex_lock(&arena->err_lock);
 524
 525                while (len) {
 526                        unsigned long chunk = min(len, PAGE_SIZE);
 527
 528                        ret = arena_write_bytes(arena, nsoff, zero_page,
 529                                chunk, 0);
 530                        if (ret)
 531                                break;
 532                        len -= chunk;
 533                        nsoff += chunk;
 534                        if (len == 0)
 535                                arena->freelist[lane].has_err = 0;
 536                }
 537                mutex_unlock(&arena->err_lock);
 538        }
 539        return ret;
 540}
 541
 542static int btt_freelist_init(struct arena_info *arena)
 543{
 544        int old, new, ret;
 545        u32 i, map_entry;
 546        struct log_entry log_new, log_old;
 547
 548        arena->freelist = kcalloc(arena->nfree, sizeof(struct free_entry),
 549                                        GFP_KERNEL);
 550        if (!arena->freelist)
 551                return -ENOMEM;
 552
 553        for (i = 0; i < arena->nfree; i++) {
 554                old = btt_log_read(arena, i, &log_old, LOG_OLD_ENT);
 555                if (old < 0)
 556                        return old;
 557
 558                new = btt_log_read(arena, i, &log_new, LOG_NEW_ENT);
 559                if (new < 0)
 560                        return new;
 561
 562                /* sub points to the next one to be overwritten */
 563                arena->freelist[i].sub = 1 - new;
 564                arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq));
 565                arena->freelist[i].block = le32_to_cpu(log_new.old_map);
 566
 567                /*
 568                 * FIXME: if error clearing fails during init, we want to make
 569                 * the BTT read-only
 570                 */
 571                if (ent_e_flag(log_new.old_map)) {
 572                        ret = arena_clear_freelist_error(arena, i);
 573                        if (ret)
 574                                dev_err_ratelimited(to_dev(arena),
 575                                        "Unable to clear known errors\n");
 576                }
 577
 578                /* This implies a newly created or untouched flog entry */
 579                if (log_new.old_map == log_new.new_map)
 580                        continue;
 581
 582                /* Check if map recovery is needed */
 583                ret = btt_map_read(arena, le32_to_cpu(log_new.lba), &map_entry,
 584                                NULL, NULL, 0);
 585                if (ret)
 586                        return ret;
 587                if ((le32_to_cpu(log_new.new_map) != map_entry) &&
 588                                (le32_to_cpu(log_new.old_map) == map_entry)) {
 589                        /*
 590                         * Last transaction wrote the flog, but wasn't able
 591                         * to complete the map write. So fix up the map.
 592                         */
 593                        ret = btt_map_write(arena, le32_to_cpu(log_new.lba),
 594                                        le32_to_cpu(log_new.new_map), 0, 0, 0);
 595                        if (ret)
 596                                return ret;
 597                }
 598        }
 599
 600        return 0;
 601}
 602
 603static bool ent_is_padding(struct log_entry *ent)
 604{
 605        return (ent->lba == 0) && (ent->old_map == 0) && (ent->new_map == 0)
 606                && (ent->seq == 0);
 607}
 608
 609/*
 610 * Detecting valid log indices: We read a log group (see the comments in btt.h
 611 * for a description of a 'log_group' and its 'slots'), and iterate over its
 612 * four slots. We expect that a padding slot will be all-zeroes, and use this
 613 * to detect a padding slot vs. an actual entry.
 614 *
 615 * If a log_group is in the initial state, i.e. hasn't been used since the
 616 * creation of this BTT layout, it will have three of the four slots with
 617 * zeroes. We skip over these log_groups for the detection of log_index. If
 618 * all log_groups are in the initial state (i.e. the BTT has never been
 619 * written to), it is safe to assume the 'new format' of log entries in slots
 620 * (0, 1).
 621 */
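/*
 * Two permutations are recognized further down: (0, 1), the current layout
 * with padding in slots (2, 3), and (0, 2), which matches log groups
 * written by kernels prior to 4.15 that padded each 16B entry out to 32B,
 * leaving padding in slots (1, 3).
 */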
 622static int log_set_indices(struct arena_info *arena)
 623{
 624        bool idx_set = false, initial_state = true;
 625        int ret, log_index[2] = {-1, -1};
 626        u32 i, j, next_idx = 0;
 627        struct log_group log;
 628        u32 pad_count = 0;
 629
 630        for (i = 0; i < arena->nfree; i++) {
 631                ret = btt_log_group_read(arena, i, &log);
 632                if (ret < 0)
 633                        return ret;
 634
 635                for (j = 0; j < 4; j++) {
 636                        if (!idx_set) {
 637                                if (ent_is_padding(&log.ent[j])) {
 638                                        pad_count++;
 639                                        continue;
 640                                } else {
 641                                        /* Skip if index has been recorded */
 642                                        if ((next_idx == 1) &&
 643                                                (j == log_index[0]))
 644                                                continue;
 645                                        /* valid entry, record index */
 646                                        log_index[next_idx] = j;
 647                                        next_idx++;
 648                                }
 649                                if (next_idx == 2) {
 650                                        /* two valid entries found */
 651                                        idx_set = true;
 652                                } else if (next_idx > 2) {
 653                                        /* too many valid indices */
 654                                        return -ENXIO;
 655                                }
 656                        } else {
 657                                /*
 658                                 * once the indices have been set, just verify
 659                                 * that all subsequent log groups are either in
 660                                 * their initial state or follow the same
 661                                 * indices.
 662                                 */
 663                                if (j == log_index[0]) {
 664                                        /* entry must be 'valid' */
 665                                        if (ent_is_padding(&log.ent[j]))
 666                                                return -ENXIO;
 667                                } else if (j == log_index[1]) {
 668                                        ;
 669                                        /*
 670                                         * log_index[1] can be padding if the
 671                                         * lane never got used and it is still
 672                                         * in the initial state (three 'padding'
 673                                         * entries)
 674                                         */
 675                                } else {
 676                                        /* entry must be invalid (padding) */
 677                                        if (!ent_is_padding(&log.ent[j]))
 678                                                return -ENXIO;
 679                                }
 680                        }
 681                }
 682                /*
  683                 * If any of the log_groups has more than one valid,
  684                 * non-padding entry, then we are no longer in the
 685                 * initial_state
 686                 */
 687                if (pad_count < 3)
 688                        initial_state = false;
 689                pad_count = 0;
 690        }
 691
 692        if (!initial_state && !idx_set)
 693                return -ENXIO;
 694
 695        /*
 696         * If all the entries in the log were in the initial state,
 697         * assume new padding scheme
 698         */
 699        if (initial_state)
 700                log_index[1] = 1;
 701
 702        /*
 703         * Only allow the known permutations of log/padding indices,
 704         * i.e. (0, 1), and (0, 2)
 705         */
 706        if ((log_index[0] == 0) && ((log_index[1] == 1) || (log_index[1] == 2)))
 707                ; /* known index possibilities */
 708        else {
 709                dev_err(to_dev(arena), "Found an unknown padding scheme\n");
 710                return -ENXIO;
 711        }
 712
 713        arena->log_index[0] = log_index[0];
 714        arena->log_index[1] = log_index[1];
 715        dev_dbg(to_dev(arena), "log_index_0 = %d\n", log_index[0]);
 716        dev_dbg(to_dev(arena), "log_index_1 = %d\n", log_index[1]);
 717        return 0;
 718}
 719
 720static int btt_rtt_init(struct arena_info *arena)
 721{
 722        arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL);
 723        if (arena->rtt == NULL)
 724                return -ENOMEM;
 725
 726        return 0;
 727}
 728
 729static int btt_maplocks_init(struct arena_info *arena)
 730{
 731        u32 i;
 732
 733        arena->map_locks = kcalloc(arena->nfree, sizeof(struct aligned_lock),
 734                                GFP_KERNEL);
 735        if (!arena->map_locks)
 736                return -ENOMEM;
 737
 738        for (i = 0; i < arena->nfree; i++)
 739                spin_lock_init(&arena->map_locks[i].lock);
 740
 741        return 0;
 742}
 743
 744static struct arena_info *alloc_arena(struct btt *btt, size_t size,
 745                                size_t start, size_t arena_off)
 746{
 747        struct arena_info *arena;
 748        u64 logsize, mapsize, datasize;
 749        u64 available = size;
 750
 751        arena = kzalloc(sizeof(struct arena_info), GFP_KERNEL);
 752        if (!arena)
 753                return NULL;
 754        arena->nd_btt = btt->nd_btt;
 755        arena->sector_size = btt->sector_size;
 756        mutex_init(&arena->err_lock);
 757
 758        if (!size)
 759                return arena;
 760
 761        arena->size = size;
 762        arena->external_lba_start = start;
 763        arena->external_lbasize = btt->lbasize;
 764        arena->internal_lbasize = roundup(arena->external_lbasize,
 765                                        INT_LBASIZE_ALIGNMENT);
 766        arena->nfree = BTT_DEFAULT_NFREE;
 767        arena->version_major = btt->nd_btt->version_major;
 768        arena->version_minor = btt->nd_btt->version_minor;
 769
 770        if (available % BTT_PG_SIZE)
 771                available -= (available % BTT_PG_SIZE);
 772
 773        /* Two pages are reserved for the super block and its copy */
 774        available -= 2 * BTT_PG_SIZE;
 775
 776        /* The log takes a fixed amount of space based on nfree */
 777        logsize = roundup(arena->nfree * LOG_GRP_SIZE, BTT_PG_SIZE);
 778        available -= logsize;
 779
 780        /* Calculate optimal split between map and data area */
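        /*
         * Each internal block costs internal_lbasize bytes of data area
         * plus MAP_ENT_SIZE bytes of map; the extra BTT_PG_SIZE taken off
         * here leaves room for rounding the map up to a page boundary.
         */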
 781        arena->internal_nlba = div_u64(available - BTT_PG_SIZE,
 782                        arena->internal_lbasize + MAP_ENT_SIZE);
 783        arena->external_nlba = arena->internal_nlba - arena->nfree;
 784
 785        mapsize = roundup((arena->external_nlba * MAP_ENT_SIZE), BTT_PG_SIZE);
 786        datasize = available - mapsize;
 787
 788        /* 'Absolute' values, relative to start of storage space */
 789        arena->infooff = arena_off;
 790        arena->dataoff = arena->infooff + BTT_PG_SIZE;
 791        arena->mapoff = arena->dataoff + datasize;
 792        arena->logoff = arena->mapoff + mapsize;
 793        arena->info2off = arena->logoff + logsize;
 794
 795        /* Default log indices are (0,1) */
 796        arena->log_index[0] = 0;
 797        arena->log_index[1] = 1;
 798        return arena;
 799}
 800
 801static void free_arenas(struct btt *btt)
 802{
 803        struct arena_info *arena, *next;
 804
 805        list_for_each_entry_safe(arena, next, &btt->arena_list, list) {
 806                list_del(&arena->list);
 807                kfree(arena->rtt);
 808                kfree(arena->map_locks);
 809                kfree(arena->freelist);
 810                debugfs_remove_recursive(arena->debugfs_dir);
 811                kfree(arena);
 812        }
 813}
 814
 815/*
 816 * This function reads an existing valid btt superblock and
 817 * populates the corresponding arena_info struct
 818 */
 819static void parse_arena_meta(struct arena_info *arena, struct btt_sb *super,
 820                                u64 arena_off)
 821{
 822        arena->internal_nlba = le32_to_cpu(super->internal_nlba);
 823        arena->internal_lbasize = le32_to_cpu(super->internal_lbasize);
 824        arena->external_nlba = le32_to_cpu(super->external_nlba);
 825        arena->external_lbasize = le32_to_cpu(super->external_lbasize);
 826        arena->nfree = le32_to_cpu(super->nfree);
 827        arena->version_major = le16_to_cpu(super->version_major);
 828        arena->version_minor = le16_to_cpu(super->version_minor);
 829
 830        arena->nextoff = (super->nextoff == 0) ? 0 : (arena_off +
 831                        le64_to_cpu(super->nextoff));
 832        arena->infooff = arena_off;
 833        arena->dataoff = arena_off + le64_to_cpu(super->dataoff);
 834        arena->mapoff = arena_off + le64_to_cpu(super->mapoff);
 835        arena->logoff = arena_off + le64_to_cpu(super->logoff);
 836        arena->info2off = arena_off + le64_to_cpu(super->info2off);
 837
 838        arena->size = (le64_to_cpu(super->nextoff) > 0)
 839                ? (le64_to_cpu(super->nextoff))
 840                : (arena->info2off - arena->infooff + BTT_PG_SIZE);
 841
 842        arena->flags = le32_to_cpu(super->flags);
 843}
 844
 845static int discover_arenas(struct btt *btt)
 846{
 847        int ret = 0;
 848        struct arena_info *arena;
 849        struct btt_sb *super;
 850        size_t remaining = btt->rawsize;
 851        u64 cur_nlba = 0;
 852        size_t cur_off = 0;
 853        int num_arenas = 0;
 854
 855        super = kzalloc(sizeof(*super), GFP_KERNEL);
 856        if (!super)
 857                return -ENOMEM;
 858
 859        while (remaining) {
 860                /* Alloc memory for arena */
 861                arena = alloc_arena(btt, 0, 0, 0);
 862                if (!arena) {
 863                        ret = -ENOMEM;
 864                        goto out_super;
 865                }
 866
 867                arena->infooff = cur_off;
 868                ret = btt_info_read(arena, super);
 869                if (ret)
 870                        goto out;
 871
 872                if (!nd_btt_arena_is_valid(btt->nd_btt, super)) {
 873                        if (remaining == btt->rawsize) {
 874                                btt->init_state = INIT_NOTFOUND;
 875                                dev_info(to_dev(arena), "No existing arenas\n");
 876                                goto out;
 877                        } else {
 878                                dev_err(to_dev(arena),
 879                                                "Found corrupted metadata!\n");
 880                                ret = -ENODEV;
 881                                goto out;
 882                        }
 883                }
 884
 885                arena->external_lba_start = cur_nlba;
 886                parse_arena_meta(arena, super, cur_off);
 887
 888                ret = log_set_indices(arena);
 889                if (ret) {
 890                        dev_err(to_dev(arena),
 891                                "Unable to deduce log/padding indices\n");
 892                        goto out;
 893                }
 894
 895                ret = btt_freelist_init(arena);
 896                if (ret)
 897                        goto out;
 898
 899                ret = btt_rtt_init(arena);
 900                if (ret)
 901                        goto out;
 902
 903                ret = btt_maplocks_init(arena);
 904                if (ret)
 905                        goto out;
 906
 907                list_add_tail(&arena->list, &btt->arena_list);
 908
 909                remaining -= arena->size;
 910                cur_off += arena->size;
 911                cur_nlba += arena->external_nlba;
 912                num_arenas++;
 913
 914                if (arena->nextoff == 0)
 915                        break;
 916        }
 917        btt->num_arenas = num_arenas;
 918        btt->nlba = cur_nlba;
 919        btt->init_state = INIT_READY;
 920
 921        kfree(super);
 922        return ret;
 923
 924 out:
 925        kfree(arena);
 926        free_arenas(btt);
 927 out_super:
 928        kfree(super);
 929        return ret;
 930}
 931
 932static int create_arenas(struct btt *btt)
 933{
 934        size_t remaining = btt->rawsize;
 935        size_t cur_off = 0;
 936
 937        while (remaining) {
 938                struct arena_info *arena;
 939                size_t arena_size = min_t(u64, ARENA_MAX_SIZE, remaining);
 940
 941                remaining -= arena_size;
 942                if (arena_size < ARENA_MIN_SIZE)
 943                        break;
 944
 945                arena = alloc_arena(btt, arena_size, btt->nlba, cur_off);
 946                if (!arena) {
 947                        free_arenas(btt);
 948                        return -ENOMEM;
 949                }
 950                btt->nlba += arena->external_nlba;
 951                if (remaining >= ARENA_MIN_SIZE)
 952                        arena->nextoff = arena->size;
 953                else
 954                        arena->nextoff = 0;
 955                cur_off += arena_size;
 956                list_add_tail(&arena->list, &btt->arena_list);
 957        }
 958
 959        return 0;
 960}
 961
 962/*
 963 * This function completes arena initialization by writing
 964 * all the metadata.
 965 * It is only called for an uninitialized arena when a write
 966 * to that arena occurs for the first time.
 967 */
 968static int btt_arena_write_layout(struct arena_info *arena)
 969{
 970        int ret;
 971        u64 sum;
 972        struct btt_sb *super;
 973        struct nd_btt *nd_btt = arena->nd_btt;
 974        const u8 *parent_uuid = nd_dev_to_uuid(&nd_btt->ndns->dev);
 975
 976        ret = btt_map_init(arena);
 977        if (ret)
 978                return ret;
 979
 980        ret = btt_log_init(arena);
 981        if (ret)
 982                return ret;
 983
 984        super = kzalloc(sizeof(struct btt_sb), GFP_NOIO);
 985        if (!super)
 986                return -ENOMEM;
 987
 988        strncpy(super->signature, BTT_SIG, BTT_SIG_LEN);
 989        memcpy(super->uuid, nd_btt->uuid, 16);
 990        memcpy(super->parent_uuid, parent_uuid, 16);
 991        super->flags = cpu_to_le32(arena->flags);
 992        super->version_major = cpu_to_le16(arena->version_major);
 993        super->version_minor = cpu_to_le16(arena->version_minor);
 994        super->external_lbasize = cpu_to_le32(arena->external_lbasize);
 995        super->external_nlba = cpu_to_le32(arena->external_nlba);
 996        super->internal_lbasize = cpu_to_le32(arena->internal_lbasize);
 997        super->internal_nlba = cpu_to_le32(arena->internal_nlba);
 998        super->nfree = cpu_to_le32(arena->nfree);
 999        super->infosize = cpu_to_le32(sizeof(struct btt_sb));
1000        super->nextoff = cpu_to_le64(arena->nextoff);
1001        /*
1002         * Subtract arena->infooff (arena start) so numbers are relative
1003         * to 'this' arena
1004         */
1005        super->dataoff = cpu_to_le64(arena->dataoff - arena->infooff);
1006        super->mapoff = cpu_to_le64(arena->mapoff - arena->infooff);
1007        super->logoff = cpu_to_le64(arena->logoff - arena->infooff);
1008        super->info2off = cpu_to_le64(arena->info2off - arena->infooff);
1009
1010        super->flags = 0;
1011        sum = nd_sb_checksum((struct nd_gen_sb *) super);
1012        super->checksum = cpu_to_le64(sum);
1013
1014        ret = btt_info_write(arena, super);
1015
1016        kfree(super);
1017        return ret;
1018}
1019
1020/*
1021 * This function completes the initialization for the BTT namespace
1022 * such that it is ready to accept IOs
1023 */
1024static int btt_meta_init(struct btt *btt)
1025{
1026        int ret = 0;
1027        struct arena_info *arena;
1028
1029        mutex_lock(&btt->init_lock);
1030        list_for_each_entry(arena, &btt->arena_list, list) {
1031                ret = btt_arena_write_layout(arena);
1032                if (ret)
1033                        goto unlock;
1034
1035                ret = btt_freelist_init(arena);
1036                if (ret)
1037                        goto unlock;
1038
1039                ret = btt_rtt_init(arena);
1040                if (ret)
1041                        goto unlock;
1042
1043                ret = btt_maplocks_init(arena);
1044                if (ret)
1045                        goto unlock;
1046        }
1047
1048        btt->init_state = INIT_READY;
1049
1050 unlock:
1051        mutex_unlock(&btt->init_lock);
1052        return ret;
1053}
1054
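/*
 * Per-sector integrity metadata, i.e. the protection information bytes:
 * for example, a 4104B external lbasize with a 4096B sector size leaves
 * 8B of metadata per sector.
 */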
1055static u32 btt_meta_size(struct btt *btt)
1056{
1057        return btt->lbasize - btt->sector_size;
1058}
1059
1060/*
1061 * This function calculates the arena in which the given LBA lies
1062 * by doing a linear walk. This is acceptable since we expect only
1063 * a few arenas. If we have backing devices that get much larger,
1064 * we can construct a balanced binary tree of arenas at init time
1065 * so that this range search becomes faster.
1066 */
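/*
 * Note that 'sector' is always in 512B units, independent of the BTT
 * sector size: with 4096B BTT sectors, block-layer sector 24 maps to
 * external LBA 3.
 */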
1067static int lba_to_arena(struct btt *btt, sector_t sector, __u32 *premap,
1068                                struct arena_info **arena)
1069{
1070        struct arena_info *arena_list;
1071        __u64 lba = div_u64(sector << SECTOR_SHIFT, btt->sector_size);
1072
1073        list_for_each_entry(arena_list, &btt->arena_list, list) {
1074                if (lba < arena_list->external_nlba) {
1075                        *arena = arena_list;
1076                        *premap = lba;
1077                        return 0;
1078                }
1079                lba -= arena_list->external_nlba;
1080        }
1081
1082        return -EIO;
1083}
1084
1085/*
1086 * The following (lock_map, unlock_map) are mostly just to improve
1087 * readability, since they index into an array of locks
1088 */
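/*
 * The lock index is derived from the cache line that holds the map entry,
 * so any two premap LBAs whose 4-byte entries share a cache line always
 * hash to the same one of the 'nfree' map locks.
 */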
1089static void lock_map(struct arena_info *arena, u32 premap)
1090                __acquires(&arena->map_locks[idx].lock)
1091{
1092        u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree;
1093
1094        spin_lock(&arena->map_locks[idx].lock);
1095}
1096
1097static void unlock_map(struct arena_info *arena, u32 premap)
1098                __releases(&arena->map_locks[idx].lock)
1099{
1100        u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree;
1101
1102        spin_unlock(&arena->map_locks[idx].lock);
1103}
1104
1105static int btt_data_read(struct arena_info *arena, struct page *page,
1106                        unsigned int off, u32 lba, u32 len)
1107{
1108        int ret;
1109        u64 nsoff = to_namespace_offset(arena, lba);
1110        void *mem = kmap_atomic(page);
1111
1112        ret = arena_read_bytes(arena, nsoff, mem + off, len, NVDIMM_IO_ATOMIC);
1113        kunmap_atomic(mem);
1114
1115        return ret;
1116}
1117
1118static int btt_data_write(struct arena_info *arena, u32 lba,
1119                        struct page *page, unsigned int off, u32 len)
1120{
1121        int ret;
1122        u64 nsoff = to_namespace_offset(arena, lba);
1123        void *mem = kmap_atomic(page);
1124
1125        ret = arena_write_bytes(arena, nsoff, mem + off, len, NVDIMM_IO_ATOMIC);
1126        kunmap_atomic(mem);
1127
1128        return ret;
1129}
1130
1131static void zero_fill_data(struct page *page, unsigned int off, u32 len)
1132{
1133        void *mem = kmap_atomic(page);
1134
1135        memset(mem + off, 0, len);
1136        kunmap_atomic(mem);
1137}
1138
1139#ifdef CONFIG_BLK_DEV_INTEGRITY
1140static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip,
1141                        struct arena_info *arena, u32 postmap, int rw)
1142{
1143        unsigned int len = btt_meta_size(btt);
1144        u64 meta_nsoff;
1145        int ret = 0;
1146
1147        if (bip == NULL)
1148                return 0;
1149
1150        meta_nsoff = to_namespace_offset(arena, postmap) + btt->sector_size;
1151
1152        while (len) {
1153                unsigned int cur_len;
1154                struct bio_vec bv;
1155                void *mem;
1156
1157                bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
1158                /*
1159                 * The 'bv' obtained from bvec_iter_bvec has its .bv_len and
1160                 * .bv_offset already adjusted for iter->bi_bvec_done, and we
1161                 * can use those directly
1162                 */
1163
1164                cur_len = min(len, bv.bv_len);
1165                mem = kmap_atomic(bv.bv_page);
1166                if (rw)
1167                        ret = arena_write_bytes(arena, meta_nsoff,
1168                                        mem + bv.bv_offset, cur_len,
1169                                        NVDIMM_IO_ATOMIC);
1170                else
1171                        ret = arena_read_bytes(arena, meta_nsoff,
1172                                        mem + bv.bv_offset, cur_len,
1173                                        NVDIMM_IO_ATOMIC);
1174
1175                kunmap_atomic(mem);
1176                if (ret)
1177                        return ret;
1178
1179                len -= cur_len;
1180                meta_nsoff += cur_len;
1181                if (!bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len))
1182                        return -EIO;
1183        }
1184
1185        return ret;
1186}
1187
1188#else /* CONFIG_BLK_DEV_INTEGRITY */
1189static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip,
1190                        struct arena_info *arena, u32 postmap, int rw)
1191{
1192        return 0;
1193}
1194#endif
1195
1196static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip,
1197                        struct page *page, unsigned int off, sector_t sector,
1198                        unsigned int len)
1199{
1200        int ret = 0;
1201        int t_flag, e_flag;
1202        struct arena_info *arena = NULL;
1203        u32 lane = 0, premap, postmap;
1204
1205        while (len) {
1206                u32 cur_len;
1207
1208                lane = nd_region_acquire_lane(btt->nd_region);
1209
1210                ret = lba_to_arena(btt, sector, &premap, &arena);
1211                if (ret)
1212                        goto out_lane;
1213
1214                cur_len = min(btt->sector_size, len);
1215
1216                ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag,
1217                                NVDIMM_IO_ATOMIC);
1218                if (ret)
1219                        goto out_lane;
1220
1221                /*
1222                 * We loop to make sure that the post map LBA didn't change
1223                 * from under us between writing the RTT and doing the actual
1224                 * read.
1225                 */
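                /*
                 * Publishing 'postmap' in the RTT (read tracking table)
                 * entry for this lane tells btt_write_pg() not to hand
                 * that block out as a free block while the read is in
                 * flight (it spins on the RTT before reusing a block).
                 */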
1226                while (1) {
1227                        u32 new_map;
1228                        int new_t, new_e;
1229
1230                        if (t_flag) {
1231                                zero_fill_data(page, off, cur_len);
1232                                goto out_lane;
1233                        }
1234
1235                        if (e_flag) {
1236                                ret = -EIO;
1237                                goto out_lane;
1238                        }
1239
1240                        arena->rtt[lane] = RTT_VALID | postmap;
1241                        /*
1242                         * Barrier to make sure this write is not reordered
1243                         * to do the verification map_read before the RTT store
1244                         */
1245                        barrier();
1246
1247                        ret = btt_map_read(arena, premap, &new_map, &new_t,
1248                                                &new_e, NVDIMM_IO_ATOMIC);
1249                        if (ret)
1250                                goto out_rtt;
1251
1252                        if ((postmap == new_map) && (t_flag == new_t) &&
1253                                        (e_flag == new_e))
1254                                break;
1255
1256                        postmap = new_map;
1257                        t_flag = new_t;
1258                        e_flag = new_e;
1259                }
1260
1261                ret = btt_data_read(arena, page, off, postmap, cur_len);
1262                if (ret) {
1263                        int rc;
1264
1265                        /* Media error - set the e_flag */
1266                        rc = btt_map_write(arena, premap, postmap, 0, 1,
1267                                NVDIMM_IO_ATOMIC);
1268                        goto out_rtt;
1269                }
1270
1271                if (bip) {
1272                        ret = btt_rw_integrity(btt, bip, arena, postmap, READ);
1273                        if (ret)
1274                                goto out_rtt;
1275                }
1276
1277                arena->rtt[lane] = RTT_INVALID;
1278                nd_region_release_lane(btt->nd_region, lane);
1279
1280                len -= cur_len;
1281                off += cur_len;
1282                sector += btt->sector_size >> SECTOR_SHIFT;
1283        }
1284
1285        return 0;
1286
1287 out_rtt:
1288        arena->rtt[lane] = RTT_INVALID;
1289 out_lane:
1290        nd_region_release_lane(btt->nd_region, lane);
1291        return ret;
1292}
1293
1294/*
1295 * Normally, arena_{read,write}_bytes will take care of the initial offset
1296 * adjustment, but in the case of btt_is_badblock, where we query is_bad_pmem,
1297 * we need the final, raw namespace offset here
1298 */
1299static bool btt_is_badblock(struct btt *btt, struct arena_info *arena,
1300                u32 postmap)
1301{
1302        u64 nsoff = adjust_initial_offset(arena->nd_btt,
1303                        to_namespace_offset(arena, postmap));
1304        sector_t phys_sector = nsoff >> 9;
1305
1306        return is_bad_pmem(btt->phys_bb, phys_sector, arena->internal_lbasize);
1307}
1308
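/*
 * Write path: pick the lane's current free block, write the new data (and
 * any integrity metadata) there, persist a flog entry recording
 * { premap, old postmap, new postmap, seq }, then update the map to point
 * at the new block. The old postmap block becomes the lane's next free
 * block (see btt_flog_write()).
 */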
1309static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
1310                        sector_t sector, struct page *page, unsigned int off,
1311                        unsigned int len)
1312{
1313        int ret = 0;
1314        struct arena_info *arena = NULL;
1315        u32 premap = 0, old_postmap, new_postmap, lane = 0, i;
1316        struct log_entry log;
1317        int sub;
1318
1319        while (len) {
1320                u32 cur_len;
1321                int e_flag;
1322
1323 retry:
1324                lane = nd_region_acquire_lane(btt->nd_region);
1325
1326                ret = lba_to_arena(btt, sector, &premap, &arena);
1327                if (ret)
1328                        goto out_lane;
1329                cur_len = min(btt->sector_size, len);
1330
1331                if ((arena->flags & IB_FLAG_ERROR_MASK) != 0) {
1332                        ret = -EIO;
1333                        goto out_lane;
1334                }
1335
1336                if (btt_is_badblock(btt, arena, arena->freelist[lane].block))
1337                        arena->freelist[lane].has_err = 1;
1338
1339                if (mutex_is_locked(&arena->err_lock)
1340                                || arena->freelist[lane].has_err) {
1341                        nd_region_release_lane(btt->nd_region, lane);
1342
1343                        ret = arena_clear_freelist_error(arena, lane);
1344                        if (ret)
1345                                return ret;
1346
1347                        /* OK to acquire a different lane/free block */
1348                        goto retry;
1349                }
1350
1351                new_postmap = arena->freelist[lane].block;
1352
1353                /* Wait if the new block is being read from */
1354                for (i = 0; i < arena->nfree; i++)
1355                        while (arena->rtt[i] == (RTT_VALID | new_postmap))
1356                                cpu_relax();
1357
1358
1359                if (new_postmap >= arena->internal_nlba) {
1360                        ret = -EIO;
1361                        goto out_lane;
1362                }
1363
1364                ret = btt_data_write(arena, new_postmap, page, off, cur_len);
1365                if (ret)
1366                        goto out_lane;
1367
1368                if (bip) {
1369                        ret = btt_rw_integrity(btt, bip, arena, new_postmap,
1370                                                WRITE);
1371                        if (ret)
1372                                goto out_lane;
1373                }
1374
1375                lock_map(arena, premap);
1376                ret = btt_map_read(arena, premap, &old_postmap, NULL, &e_flag,
1377                                NVDIMM_IO_ATOMIC);
1378                if (ret)
1379                        goto out_map;
1380                if (old_postmap >= arena->internal_nlba) {
1381                        ret = -EIO;
1382                        goto out_map;
1383                }
1384                if (e_flag)
1385                        set_e_flag(old_postmap);
1386
1387                log.lba = cpu_to_le32(premap);
1388                log.old_map = cpu_to_le32(old_postmap);
1389                log.new_map = cpu_to_le32(new_postmap);
1390                log.seq = cpu_to_le32(arena->freelist[lane].seq);
1391                sub = arena->freelist[lane].sub;
1392                ret = btt_flog_write(arena, lane, sub, &log);
1393                if (ret)
1394                        goto out_map;
1395
1396                ret = btt_map_write(arena, premap, new_postmap, 0, 0,
1397                        NVDIMM_IO_ATOMIC);
1398                if (ret)
1399                        goto out_map;
1400
1401                unlock_map(arena, premap);
1402                nd_region_release_lane(btt->nd_region, lane);
1403
1404                if (e_flag) {
1405                        ret = arena_clear_freelist_error(arena, lane);
1406                        if (ret)
1407                                return ret;
1408                }
1409
1410                len -= cur_len;
1411                off += cur_len;
1412                sector += btt->sector_size >> SECTOR_SHIFT;
1413        }
1414
1415        return 0;
1416
1417 out_map:
1418        unlock_map(arena, premap);
1419 out_lane:
1420        nd_region_release_lane(btt->nd_region, lane);
1421        return ret;
1422}
1423
1424static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
1425                        struct page *page, unsigned int len, unsigned int off,
1426                        bool is_write, sector_t sector)
1427{
1428        int ret;
1429
1430        if (!is_write) {
1431                ret = btt_read_pg(btt, bip, page, off, sector, len);
1432                flush_dcache_page(page);
1433        } else {
1434                flush_dcache_page(page);
1435                ret = btt_write_pg(btt, bip, sector, page, off, len);
1436        }
1437
1438        return ret;
1439}
1440
1441static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
1442{
1443        struct bio_integrity_payload *bip = bio_integrity(bio);
1444        struct btt *btt = q->queuedata;
1445        struct bvec_iter iter;
1446        unsigned long start;
1447        struct bio_vec bvec;
1448        int err = 0;
1449        bool do_acct;
1450
1451        if (!bio_integrity_prep(bio))
1452                return BLK_QC_T_NONE;
1453
1454        do_acct = nd_iostat_start(bio, &start);
1455        bio_for_each_segment(bvec, bio, iter) {
1456                unsigned int len = bvec.bv_len;
1457
1458                if (len > PAGE_SIZE || len < btt->sector_size ||
1459                                len % btt->sector_size) {
1460                        dev_err_ratelimited(&btt->nd_btt->dev,
1461                                "unaligned bio segment (len: %d)\n", len);
1462                        bio->bi_status = BLK_STS_IOERR;
1463                        break;
1464                }
1465
1466                err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset,
1467                                  op_is_write(bio_op(bio)), iter.bi_sector);
1468                if (err) {
1469                        dev_err(&btt->nd_btt->dev,
1470                                        "io error in %s sector %lld, len %d,\n",
1471                                        (op_is_write(bio_op(bio))) ? "WRITE" :
1472                                        "READ",
1473                                        (unsigned long long) iter.bi_sector, len);
1474                        bio->bi_status = errno_to_blk_status(err);
1475                        break;
1476                }
1477        }
1478        if (do_acct)
1479                nd_iostat_end(bio, start);
1480
1481        bio_endio(bio);
1482        return BLK_QC_T_NONE;
1483}
1484
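/*
 * rw_page entry point: synchronously transfer a single (possibly huge)
 * page without building a bio.
 */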
1485static int btt_rw_page(struct block_device *bdev, sector_t sector,
1486                struct page *page, bool is_write)
1487{
1488        struct btt *btt = bdev->bd_disk->private_data;
1489        int rc;
1490        unsigned int len;
1491
1492        len = hpage_nr_pages(page) * PAGE_SIZE;
1493        rc = btt_do_bvec(btt, NULL, page, len, 0, is_write, sector);
1494        if (rc == 0)
1495                page_endio(page, is_write, 0);
1496
1497        return rc;
1498}
1500
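/* fabricated CHS geometry for tools that still issue HDIO_GETGEO */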
1501static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo)
1502{
1503        /* some standard values */
1504        geo->heads = 1 << 6;
1505        geo->sectors = 1 << 5;
1506        geo->cylinders = get_capacity(bd->bd_disk) >> 11;
1507        return 0;
1508}
1509
1510static const struct block_device_operations btt_fops = {
1511        .owner =                THIS_MODULE,
1512        .rw_page =              btt_rw_page,
1513        .getgeo =               btt_getgeo,
1514        .revalidate_disk =      nvdimm_revalidate_disk,
1515};
1516
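/*
 * Create the request queue and gendisk that expose the BTT to the
 * block layer, wire up blk-integrity if the BTT format carries
 * per-sector metadata, and publish the capacity in 512B sectors.
 */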
1517static int btt_blk_init(struct btt *btt)
1518{
1519        struct nd_btt *nd_btt = btt->nd_btt;
1520        struct nd_namespace_common *ndns = nd_btt->ndns;
1521
1522        /* create a new disk and request queue for btt */
1523        btt->btt_queue = blk_alloc_queue(GFP_KERNEL);
1524        if (!btt->btt_queue)
1525                return -ENOMEM;
1526
1527        btt->btt_disk = alloc_disk(0);
1528        if (!btt->btt_disk) {
1529                blk_cleanup_queue(btt->btt_queue);
1530                return -ENOMEM;
1531        }
1532
1533        nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name);
1534        btt->btt_disk->first_minor = 0;
1535        btt->btt_disk->fops = &btt_fops;
1536        btt->btt_disk->private_data = btt;
1537        btt->btt_disk->queue = btt->btt_queue;
1538        btt->btt_disk->flags = GENHD_FL_EXT_DEVT;
1539        btt->btt_disk->queue->backing_dev_info->capabilities |=
1540                        BDI_CAP_SYNCHRONOUS_IO;
1541
1542        blk_queue_make_request(btt->btt_queue, btt_make_request);
1543        blk_queue_logical_block_size(btt->btt_queue, btt->sector_size);
1544        blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX);
1545        blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_queue);
1546        btt->btt_queue->queuedata = btt;
1547
1548        if (btt_meta_size(btt)) {
1549                int rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt));
1550
1551                if (rc) {
1552                        del_gendisk(btt->btt_disk);
1553                        put_disk(btt->btt_disk);
1554                        blk_cleanup_queue(btt->btt_queue);
1555                        return rc;
1556                }
1557        }
1558        set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9);
1559        device_add_disk(&btt->nd_btt->dev, btt->btt_disk);
1560        btt->nd_btt->size = btt->nlba * (u64)btt->sector_size;
1561        revalidate_disk(btt->btt_disk);
1562
1563        return 0;
1564}
1565
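/* undo btt_blk_init(): unregister the gendisk and release the queue */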
1566static void btt_blk_cleanup(struct btt *btt)
1567{
1568        del_gendisk(btt->btt_disk);
1569        put_disk(btt->btt_disk);
1570        blk_cleanup_queue(btt->btt_queue);
1571}
1572
1573/**
1574 * btt_init - initialize a block translation table for the given device
1575 * @nd_btt:     device with BTT geometry and backing device info
1576 * @rawsize:    raw size in bytes of the backing device
1577 * @lbasize:    lba size of the backing device
1578 * @uuid:       A uuid for the backing device - this is stored on media
1579 * @nd_region:  parent region of the backing device, used to acquire lanes
1580 *
1581 * Initialize a Block Translation Table on a backing device to provide
1582 * single sector power fail atomicity.
1583 *
1584 * Context:
1585 * Might sleep.
1586 *
1587 * Returns:
1588 * Pointer to a new struct btt on success, NULL on failure.
1589 */
1590static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
1591                u32 lbasize, u8 *uuid, struct nd_region *nd_region)
1592{
1593        int ret;
1594        struct btt *btt;
1595        struct nd_namespace_io *nsio;
1596        struct device *dev = &nd_btt->dev;
1597
1598        btt = devm_kzalloc(dev, sizeof(struct btt), GFP_KERNEL);
1599        if (!btt)
1600                return NULL;
1601
1602        btt->nd_btt = nd_btt;
1603        btt->rawsize = rawsize;
1604        btt->lbasize = lbasize;
1605        btt->sector_size = ((lbasize >= 4096) ? 4096 : 512);
1606        INIT_LIST_HEAD(&btt->arena_list);
1607        mutex_init(&btt->init_lock);
1608        btt->nd_region = nd_region;
1609        nsio = to_nd_namespace_io(&nd_btt->ndns->dev);
1610        btt->phys_bb = &nsio->bb;
1611
1612        ret = discover_arenas(btt);
1613        if (ret) {
1614                dev_err(dev, "init: error in discover_arenas: %d\n", ret);
1615                return NULL;
1616        }
1617
1618        if (btt->init_state != INIT_READY && nd_region->ro) {
1619                dev_warn(dev, "%s is read-only, unable to init btt metadata\n",
1620                                dev_name(&nd_region->dev));
1621                return NULL;
1622        } else if (btt->init_state != INIT_READY) {
1623                btt->num_arenas = (rawsize / ARENA_MAX_SIZE) +
1624                        ((rawsize % ARENA_MAX_SIZE) ? 1 : 0);
1625                dev_dbg(dev, "init: %d arenas for %llu rawsize\n",
1626                                btt->num_arenas, rawsize);
1627
1628                ret = create_arenas(btt);
1629                if (ret) {
1630                        dev_info(dev, "init: create_arenas: %d\n", ret);
1631                        return NULL;
1632                }
1633
1634                ret = btt_meta_init(btt);
1635                if (ret) {
1636                        dev_err(dev, "init: error in meta_init: %d\n", ret);
1637                        return NULL;
1638                }
1639        }
1640
1641        ret = btt_blk_init(btt);
1642        if (ret) {
1643                dev_err(dev, "init: error in blk_init: %d\n", ret);
1644                return NULL;
1645        }
1646
1647        btt_debugfs_init(btt);
1648
1649        return btt;
1650}
1651
1652/**
1653 * btt_fini - de-initialize a BTT
1654 * @btt:        the BTT handle that was generated by btt_init
1655 *
1656 * De-initialize a Block Translation Table on device removal
1657 *
1658 * Context:
1659 * Might sleep.
1660 */
1661static void btt_fini(struct btt *btt)
1662{
1663        if (btt) {
1664                btt_blk_cleanup(btt);
1665                free_arenas(btt);
1666                debugfs_remove_recursive(btt->debugfs_dir);
1667        }
1668}
1669
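/*
 * Attach a BTT instance to a claimed namespace: verify the BTT
 * configuration is complete, check that the namespace is large enough
 * for at least one arena, and initialize (or re-discover) the BTT.
 */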
1670int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
1671{
1672        struct nd_btt *nd_btt = to_nd_btt(ndns->claim);
1673        struct nd_region *nd_region;
1674        struct btt_sb *btt_sb;
1675        struct btt *btt;
1676        size_t rawsize;
1677
1678        if (!nd_btt->uuid || !nd_btt->ndns || !nd_btt->lbasize) {
1679                dev_dbg(&nd_btt->dev, "incomplete btt configuration\n");
1680                return -ENODEV;
1681        }
1682
1683        btt_sb = devm_kzalloc(&nd_btt->dev, sizeof(*btt_sb), GFP_KERNEL);
1684        if (!btt_sb)
1685                return -ENOMEM;
1686
1687        /*
1688         * If this returns < 0, that is ok as it just means there wasn't
1689         * an existing BTT, and we're creating a new one. We still need to
1690         * call this as we need the version dependent fields in nd_btt to be
1691         * set correctly based on the holder class
1692         */
1693        nd_btt_version(nd_btt, ndns, btt_sb);
1694
1695        rawsize = nvdimm_namespace_capacity(ndns) - nd_btt->initial_offset;
1696        if (rawsize < ARENA_MIN_SIZE) {
1697                dev_dbg(&nd_btt->dev, "%s must be at least %ld bytes\n",
1698                                dev_name(&ndns->dev),
1699                                ARENA_MIN_SIZE + nd_btt->initial_offset);
1700                return -ENXIO;
1701        }
1702        nd_region = to_nd_region(nd_btt->dev.parent);
1703        btt = btt_init(nd_btt, rawsize, nd_btt->lbasize, nd_btt->uuid,
1704                        nd_region);
1705        if (!btt)
1706                return -ENOMEM;
1707        nd_btt->btt = btt;
1708
1709        return 0;
1710}
1711EXPORT_SYMBOL(nvdimm_namespace_attach_btt);
1712
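/* detach and tear down the BTT previously set up by the attach path */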
1713int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt)
1714{
1715        struct btt *btt = nd_btt->btt;
1716
1717        btt_fini(btt);
1718        nd_btt->btt = NULL;
1719
1720        return 0;
1721}
1722EXPORT_SYMBOL(nvdimm_namespace_detach_btt);
1723
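/* module init: create the debugfs root directory for BTT debug info */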
1724static int __init nd_btt_init(void)
1725{
1726        int rc = 0;
1727
1728        debugfs_root = debugfs_create_dir("btt", NULL);
1729        if (IS_ERR_OR_NULL(debugfs_root))
1730                rc = -ENXIO;
1731
1732        return rc;
1733}
1734
1735static void __exit nd_btt_exit(void)
1736{
1737        debugfs_remove_recursive(debugfs_root);
1738}
1739
1740MODULE_ALIAS_ND_DEVICE(ND_DEVICE_BTT);
1741MODULE_AUTHOR("Vishal Verma <vishal.l.verma@linux.intel.com>");
1742MODULE_LICENSE("GPL v2");
1743module_init(nd_btt_init);
1744module_exit(nd_btt_exit);
1745