linux/drivers/md/dm-stats.c
<<
>>
Prefs
   1#include <linux/errno.h>
   2#include <linux/numa.h>
   3#include <linux/slab.h>
   4#include <linux/rculist.h>
   5#include <linux/threads.h>
   6#include <linux/preempt.h>
   7#include <linux/irqflags.h>
   8#include <linux/vmalloc.h>
   9#include <linux/mm.h>
  10#include <linux/module.h>
  11#include <linux/device-mapper.h>
  12
  13#include "dm.h"
  14#include "dm-stats.h"
  15
/* Message prefix for this file (presumably consumed by the DM* logging macros). */
#define DM_MSG_PREFIX "stats"

/*
 * Set to 1 by dm_stats_delete() whenever it queues dm_stat_free() through
 * call_rcu(); dm_statistics_exit() calls rcu_barrier() if it is non-zero.
 */
static int dm_stat_need_rcu_barrier;
  19
/*
 * Using 64-bit values to avoid overflow (which is a
 * problem that block/genhd.c's IO accounting has).
 *
 * Counters kept per area and per CPU.  Two-element arrays are indexed
 * by direction ([READ] / [WRITE]).
 */
struct dm_stat_percpu {
        unsigned long long sectors[2];          /* sum of fragment lengths of completed I/O */
        unsigned long long ios[2];              /* number of completed I/Os */
        unsigned long long merges[2];           /* completed I/Os that carried the merged flag */
        unsigned long long ticks[2];            /* sum of the durations passed in at completion */
        unsigned long long io_ticks[2];         /* jiffies during which I/O of that direction was in flight */
        unsigned long long io_ticks_total;      /* jiffies during which any I/O was in flight */
        unsigned long long time_in_queue;       /* in-flight count multiplied by elapsed jiffies */
};
  33
/* Per-area state shared by all CPUs. */
struct dm_stat_shared {
        atomic_t in_flight[2];          /* I/Os started but not yet ended, by direction */
        unsigned long stamp;            /* jiffies value recorded by the last dm_stat_round() */
        struct dm_stat_percpu tmp;      /* scratch totals summed over all CPUs for print/clear */
};
  39
  40struct dm_stat {
  41        struct list_head list_entry;
  42        int id;
  43        size_t n_entries;
  44        sector_t start;
  45        sector_t end;
  46        sector_t step;
  47        const char *program_id;
  48        const char *aux_data;
  49        struct rcu_head rcu_head;
  50        size_t shared_alloc_size;
  51        size_t percpu_alloc_size;
  52        struct dm_stat_percpu *stat_percpu[NR_CPUS];
  53        struct dm_stat_shared stat_shared[0];
  54};
  55
/*
 * Per-CPU record of where the previous I/O seen by dm_stats_account_io()
 * ended; used there to compute the "merged" flag.
 */
struct dm_stats_last_position {
        sector_t last_sector;   /* end sector of the previous I/O */
        unsigned last_rw;       /* bi_rw flags of the previous I/O */
};
  60
/*
 * A typo on the command line could possibly make the kernel run out of memory
 * and crash. To prevent the crash we account all used memory. We fail if we
 * exhaust 1/4 of all memory or 1/2 of vmalloc space.
 */
#define DM_STATS_MEMORY_FACTOR          4
#define DM_STATS_VMALLOC_FACTOR         2

/* Protects shared_memory_amount. */
static DEFINE_SPINLOCK(shared_memory_lock);

/* Bytes currently charged through claim_shared_memory(). */
static unsigned long shared_memory_amount;
  72
  73static bool __check_shared_memory(size_t alloc_size)
  74{
  75        size_t a;
  76
  77        a = shared_memory_amount + alloc_size;
  78        if (a < shared_memory_amount)
  79                return false;
  80        if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR)
  81                return false;
  82#ifdef CONFIG_MMU
  83        if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR)
  84                return false;
  85#endif
  86        return true;
  87}
  88
  89static bool check_shared_memory(size_t alloc_size)
  90{
  91        bool ret;
  92
  93        spin_lock_irq(&shared_memory_lock);
  94
  95        ret = __check_shared_memory(alloc_size);
  96
  97        spin_unlock_irq(&shared_memory_lock);
  98
  99        return ret;
 100}
 101
 102static bool claim_shared_memory(size_t alloc_size)
 103{
 104        spin_lock_irq(&shared_memory_lock);
 105
 106        if (!__check_shared_memory(alloc_size)) {
 107                spin_unlock_irq(&shared_memory_lock);
 108                return false;
 109        }
 110
 111        shared_memory_amount += alloc_size;
 112
 113        spin_unlock_irq(&shared_memory_lock);
 114
 115        return true;
 116}
 117
 118static void free_shared_memory(size_t alloc_size)
 119{
 120        unsigned long flags;
 121
 122        spin_lock_irqsave(&shared_memory_lock, flags);
 123
 124        if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) {
 125                spin_unlock_irqrestore(&shared_memory_lock, flags);
 126                DMCRIT("Memory usage accounting bug.");
 127                return;
 128        }
 129
 130        shared_memory_amount -= alloc_size;
 131
 132        spin_unlock_irqrestore(&shared_memory_lock, flags);
 133}
 134
 135static void *dm_kvzalloc(size_t alloc_size, int node)
 136{
 137        void *p;
 138
 139        if (!claim_shared_memory(alloc_size))
 140                return NULL;
 141
 142        if (alloc_size <= KMALLOC_MAX_SIZE) {
 143                p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
 144                if (p)
 145                        return p;
 146        }
 147        p = vzalloc_node(alloc_size, node);
 148        if (p)
 149                return p;
 150
 151        free_shared_memory(alloc_size);
 152
 153        return NULL;
 154}
 155
 156static void dm_kvfree(void *ptr, size_t alloc_size)
 157{
 158        if (!ptr)
 159                return;
 160
 161        free_shared_memory(alloc_size);
 162
 163        if (is_vmalloc_addr(ptr))
 164                vfree(ptr);
 165        else
 166                kfree(ptr);
 167}
 168
 169static void dm_stat_free(struct rcu_head *head)
 170{
 171        int cpu;
 172        struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);
 173
 174        kfree(s->program_id);
 175        kfree(s->aux_data);
 176        for_each_possible_cpu(cpu)
 177                dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
 178        dm_kvfree(s, s->shared_alloc_size);
 179}
 180
 181static int dm_stat_in_flight(struct dm_stat_shared *shared)
 182{
 183        return atomic_read(&shared->in_flight[READ]) +
 184               atomic_read(&shared->in_flight[WRITE]);
 185}
 186
 187void dm_stats_init(struct dm_stats *stats)
 188{
 189        int cpu;
 190        struct dm_stats_last_position *last;
 191
 192        mutex_init(&stats->mutex);
 193        INIT_LIST_HEAD(&stats->list);
 194        stats->last = alloc_percpu(struct dm_stats_last_position);
 195        for_each_possible_cpu(cpu) {
 196                last = per_cpu_ptr(stats->last, cpu);
 197                last->last_sector = (sector_t)ULLONG_MAX;
 198                last->last_rw = UINT_MAX;
 199        }
 200}
 201
 202void dm_stats_cleanup(struct dm_stats *stats)
 203{
 204        size_t ni;
 205        struct dm_stat *s;
 206        struct dm_stat_shared *shared;
 207
 208        while (!list_empty(&stats->list)) {
 209                s = container_of(stats->list.next, struct dm_stat, list_entry);
 210                list_del(&s->list_entry);
 211                for (ni = 0; ni < s->n_entries; ni++) {
 212                        shared = &s->stat_shared[ni];
 213                        if (WARN_ON(dm_stat_in_flight(shared))) {
 214                                DMCRIT("leaked in-flight counter at index %lu "
 215                                       "(start %llu, end %llu, step %llu): reads %d, writes %d",
 216                                       (unsigned long)ni,
 217                                       (unsigned long long)s->start,
 218                                       (unsigned long long)s->end,
 219                                       (unsigned long long)s->step,
 220                                       atomic_read(&shared->in_flight[READ]),
 221                                       atomic_read(&shared->in_flight[WRITE]));
 222                        }
 223                }
 224                dm_stat_free(&s->rcu_head);
 225        }
 226        free_percpu(stats->last);
 227}
 228
 229static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 230                           sector_t step, const char *program_id, const char *aux_data,
 231                           void (*suspend_callback)(struct mapped_device *),
 232                           void (*resume_callback)(struct mapped_device *),
 233                           struct mapped_device *md)
 234{
 235        struct list_head *l;
 236        struct dm_stat *s, *tmp_s;
 237        sector_t n_entries;
 238        size_t ni;
 239        size_t shared_alloc_size;
 240        size_t percpu_alloc_size;
 241        struct dm_stat_percpu *p;
 242        int cpu;
 243        int ret_id;
 244        int r;
 245
 246        if (end < start || !step)
 247                return -EINVAL;
 248
 249        n_entries = end - start;
 250        if (dm_sector_div64(n_entries, step))
 251                n_entries++;
 252
 253        if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
 254                return -EOVERFLOW;
 255
 256        shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared);
 257        if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
 258                return -EOVERFLOW;
 259
 260        percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu);
 261        if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
 262                return -EOVERFLOW;
 263
 264        if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size))
 265                return -ENOMEM;
 266
 267        s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
 268        if (!s)
 269                return -ENOMEM;
 270
 271        s->n_entries = n_entries;
 272        s->start = start;
 273        s->end = end;
 274        s->step = step;
 275        s->shared_alloc_size = shared_alloc_size;
 276        s->percpu_alloc_size = percpu_alloc_size;
 277
 278        s->program_id = kstrdup(program_id, GFP_KERNEL);
 279        if (!s->program_id) {
 280                r = -ENOMEM;
 281                goto out;
 282        }
 283        s->aux_data = kstrdup(aux_data, GFP_KERNEL);
 284        if (!s->aux_data) {
 285                r = -ENOMEM;
 286                goto out;
 287        }
 288
 289        for (ni = 0; ni < n_entries; ni++) {
 290                atomic_set(&s->stat_shared[ni].in_flight[READ], 0);
 291                atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
 292        }
 293
 294        for_each_possible_cpu(cpu) {
 295                p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
 296                if (!p) {
 297                        r = -ENOMEM;
 298                        goto out;
 299                }
 300                s->stat_percpu[cpu] = p;
 301        }
 302
 303        /*
 304         * Suspend/resume to make sure there is no i/o in flight,
 305         * so that newly created statistics will be exact.
 306         *
 307         * (note: we couldn't suspend earlier because we must not
 308         * allocate memory while suspended)
 309         */
 310        suspend_callback(md);
 311
 312        mutex_lock(&stats->mutex);
 313        s->id = 0;
 314        list_for_each(l, &stats->list) {
 315                tmp_s = container_of(l, struct dm_stat, list_entry);
 316                if (WARN_ON(tmp_s->id < s->id)) {
 317                        r = -EINVAL;
 318                        goto out_unlock_resume;
 319                }
 320                if (tmp_s->id > s->id)
 321                        break;
 322                if (unlikely(s->id == INT_MAX)) {
 323                        r = -ENFILE;
 324                        goto out_unlock_resume;
 325                }
 326                s->id++;
 327        }
 328        ret_id = s->id;
 329        list_add_tail_rcu(&s->list_entry, l);
 330        mutex_unlock(&stats->mutex);
 331
 332        resume_callback(md);
 333
 334        return ret_id;
 335
 336out_unlock_resume:
 337        mutex_unlock(&stats->mutex);
 338        resume_callback(md);
 339out:
 340        dm_stat_free(&s->rcu_head);
 341        return r;
 342}
 343
 344static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id)
 345{
 346        struct dm_stat *s;
 347
 348        list_for_each_entry(s, &stats->list, list_entry) {
 349                if (s->id > id)
 350                        break;
 351                if (s->id == id)
 352                        return s;
 353        }
 354
 355        return NULL;
 356}
 357
 358static int dm_stats_delete(struct dm_stats *stats, int id)
 359{
 360        struct dm_stat *s;
 361        int cpu;
 362
 363        mutex_lock(&stats->mutex);
 364
 365        s = __dm_stats_find(stats, id);
 366        if (!s) {
 367                mutex_unlock(&stats->mutex);
 368                return -ENOENT;
 369        }
 370
 371        list_del_rcu(&s->list_entry);
 372        mutex_unlock(&stats->mutex);
 373
 374        /*
 375         * vfree can't be called from RCU callback
 376         */
 377        for_each_possible_cpu(cpu)
 378                if (is_vmalloc_addr(s->stat_percpu))
 379                        goto do_sync_free;
 380        if (is_vmalloc_addr(s)) {
 381do_sync_free:
 382                synchronize_rcu_expedited();
 383                dm_stat_free(&s->rcu_head);
 384        } else {
 385                ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1;
 386                call_rcu(&s->rcu_head, dm_stat_free);
 387        }
 388        return 0;
 389}
 390
 391static int dm_stats_list(struct dm_stats *stats, const char *program,
 392                         char *result, unsigned maxlen)
 393{
 394        struct dm_stat *s;
 395        sector_t len;
 396        unsigned sz = 0;
 397
 398        /*
 399         * Output format:
 400         *   <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
 401         */
 402
 403        mutex_lock(&stats->mutex);
 404        list_for_each_entry(s, &stats->list, list_entry) {
 405                if (!program || !strcmp(program, s->program_id)) {
 406                        len = s->end - s->start;
 407                        DMEMIT("%d: %llu+%llu %llu %s %s\n", s->id,
 408                                (unsigned long long)s->start,
 409                                (unsigned long long)len,
 410                                (unsigned long long)s->step,
 411                                s->program_id,
 412                                s->aux_data);
 413                }
 414        }
 415        mutex_unlock(&stats->mutex);
 416
 417        return 1;
 418}
 419
 420static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *p)
 421{
 422        /*
 423         * This is racy, but so is part_round_stats_single.
 424         */
 425        unsigned long now = jiffies;
 426        unsigned in_flight_read;
 427        unsigned in_flight_write;
 428        unsigned long difference = now - shared->stamp;
 429
 430        if (!difference)
 431                return;
 432        in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
 433        in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
 434        if (in_flight_read)
 435                p->io_ticks[READ] += difference;
 436        if (in_flight_write)
 437                p->io_ticks[WRITE] += difference;
 438        if (in_flight_read + in_flight_write) {
 439                p->io_ticks_total += difference;
 440                p->time_in_queue += (in_flight_read + in_flight_write) * difference;
 441        }
 442        shared->stamp = now;
 443}
 444
 445static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
 446                              unsigned long bi_rw, sector_t len, bool merged,
 447                              bool end, unsigned long duration)
 448{
 449        unsigned long idx = bi_rw & REQ_WRITE;
 450        struct dm_stat_shared *shared = &s->stat_shared[entry];
 451        struct dm_stat_percpu *p;
 452
 453        /*
 454         * For strict correctness we should use local_irq_save/restore
 455         * instead of preempt_disable/enable.
 456         *
 457         * preempt_disable/enable is racy if the driver finishes bios
 458         * from non-interrupt context as well as from interrupt context
 459         * or from more different interrupts.
 460         *
 461         * On 64-bit architectures the race only results in not counting some
 462         * events, so it is acceptable.  On 32-bit architectures the race could
 463         * cause the counter going off by 2^32, so we need to do proper locking
 464         * there.
 465         *
 466         * part_stat_lock()/part_stat_unlock() have this race too.
 467         */
 468#if BITS_PER_LONG == 32
 469        unsigned long flags;
 470        local_irq_save(flags);
 471#else
 472        preempt_disable();
 473#endif
 474        p = &s->stat_percpu[smp_processor_id()][entry];
 475
 476        if (!end) {
 477                dm_stat_round(shared, p);
 478                atomic_inc(&shared->in_flight[idx]);
 479        } else {
 480                dm_stat_round(shared, p);
 481                atomic_dec(&shared->in_flight[idx]);
 482                p->sectors[idx] += len;
 483                p->ios[idx] += 1;
 484                p->merges[idx] += merged;
 485                p->ticks[idx] += duration;
 486        }
 487
 488#if BITS_PER_LONG == 32
 489        local_irq_restore(flags);
 490#else
 491        preempt_enable();
 492#endif
 493}
 494
 495static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
 496                          sector_t bi_sector, sector_t end_sector,
 497                          bool end, unsigned long duration,
 498                          struct dm_stats_aux *stats_aux)
 499{
 500        sector_t rel_sector, offset, todo, fragment_len;
 501        size_t entry;
 502
 503        if (end_sector <= s->start || bi_sector >= s->end)
 504                return;
 505        if (unlikely(bi_sector < s->start)) {
 506                rel_sector = 0;
 507                todo = end_sector - s->start;
 508        } else {
 509                rel_sector = bi_sector - s->start;
 510                todo = end_sector - bi_sector;
 511        }
 512        if (unlikely(end_sector > s->end))
 513                todo -= (end_sector - s->end);
 514
 515        offset = dm_sector_div64(rel_sector, s->step);
 516        entry = rel_sector;
 517        do {
 518                if (WARN_ON_ONCE(entry >= s->n_entries)) {
 519                        DMCRIT("Invalid area access in region id %d", s->id);
 520                        return;
 521                }
 522                fragment_len = todo;
 523                if (fragment_len > s->step - offset)
 524                        fragment_len = s->step - offset;
 525                dm_stat_for_entry(s, entry, bi_rw, fragment_len,
 526                                  stats_aux->merged, end, duration);
 527                todo -= fragment_len;
 528                entry++;
 529                offset = 0;
 530        } while (unlikely(todo != 0));
 531}
 532
 533void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
 534                         sector_t bi_sector, unsigned bi_sectors, bool end,
 535                         unsigned long duration, struct dm_stats_aux *stats_aux)
 536{
 537        struct dm_stat *s;
 538        sector_t end_sector;
 539        struct dm_stats_last_position *last;
 540
 541        if (unlikely(!bi_sectors))
 542                return;
 543
 544        end_sector = bi_sector + bi_sectors;
 545
 546        if (!end) {
 547                /*
 548                 * A race condition can at worst result in the merged flag being
 549                 * misrepresented, so we don't have to disable preemption here.
 550                 */
 551                last = __this_cpu_ptr(stats->last);
 552                stats_aux->merged =
 553                        (bi_sector == (ACCESS_ONCE(last->last_sector) &&
 554                                       ((bi_rw & (REQ_WRITE | REQ_DISCARD)) ==
 555                                        (ACCESS_ONCE(last->last_rw) & (REQ_WRITE | REQ_DISCARD)))
 556                                       ));
 557                ACCESS_ONCE(last->last_sector) = end_sector;
 558                ACCESS_ONCE(last->last_rw) = bi_rw;
 559        }
 560
 561        rcu_read_lock();
 562
 563        list_for_each_entry_rcu(s, &stats->list, list_entry)
 564                __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration, stats_aux);
 565
 566        rcu_read_unlock();
 567}
 568
 569static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared,
 570                                                   struct dm_stat *s, size_t x)
 571{
 572        int cpu;
 573        struct dm_stat_percpu *p;
 574
 575        local_irq_disable();
 576        p = &s->stat_percpu[smp_processor_id()][x];
 577        dm_stat_round(shared, p);
 578        local_irq_enable();
 579
 580        memset(&shared->tmp, 0, sizeof(shared->tmp));
 581        for_each_possible_cpu(cpu) {
 582                p = &s->stat_percpu[cpu][x];
 583                shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
 584                shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]);
 585                shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]);
 586                shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]);
 587                shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]);
 588                shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]);
 589                shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]);
 590                shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]);
 591                shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]);
 592                shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
 593                shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
 594                shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
 595        }
 596}
 597
 598static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
 599                            bool init_tmp_percpu_totals)
 600{
 601        size_t x;
 602        struct dm_stat_shared *shared;
 603        struct dm_stat_percpu *p;
 604
 605        for (x = idx_start; x < idx_end; x++) {
 606                shared = &s->stat_shared[x];
 607                if (init_tmp_percpu_totals)
 608                        __dm_stat_init_temporary_percpu_totals(shared, s, x);
 609                local_irq_disable();
 610                p = &s->stat_percpu[smp_processor_id()][x];
 611                p->sectors[READ] -= shared->tmp.sectors[READ];
 612                p->sectors[WRITE] -= shared->tmp.sectors[WRITE];
 613                p->ios[READ] -= shared->tmp.ios[READ];
 614                p->ios[WRITE] -= shared->tmp.ios[WRITE];
 615                p->merges[READ] -= shared->tmp.merges[READ];
 616                p->merges[WRITE] -= shared->tmp.merges[WRITE];
 617                p->ticks[READ] -= shared->tmp.ticks[READ];
 618                p->ticks[WRITE] -= shared->tmp.ticks[WRITE];
 619                p->io_ticks[READ] -= shared->tmp.io_ticks[READ];
 620                p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE];
 621                p->io_ticks_total -= shared->tmp.io_ticks_total;
 622                p->time_in_queue -= shared->tmp.time_in_queue;
 623                local_irq_enable();
 624        }
 625}
 626
 627static int dm_stats_clear(struct dm_stats *stats, int id)
 628{
 629        struct dm_stat *s;
 630
 631        mutex_lock(&stats->mutex);
 632
 633        s = __dm_stats_find(stats, id);
 634        if (!s) {
 635                mutex_unlock(&stats->mutex);
 636                return -ENOENT;
 637        }
 638
 639        __dm_stat_clear(s, 0, s->n_entries, true);
 640
 641        mutex_unlock(&stats->mutex);
 642
 643        return 1;
 644}
 645
 646/*
 647 * This is like jiffies_to_msec, but works for 64-bit values.
 648 */
 649static unsigned long long dm_jiffies_to_msec64(unsigned long long j)
 650{
 651        unsigned long long result = 0;
 652        unsigned mult;
 653
 654        if (j)
 655                result = jiffies_to_msecs(j & 0x3fffff);
 656        if (j >= 1 << 22) {
 657                mult = jiffies_to_msecs(1 << 22);
 658                result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff);
 659        }
 660        if (j >= 1ULL << 44)
 661                result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44);
 662
 663        return result;
 664}
 665
 666static int dm_stats_print(struct dm_stats *stats, int id,
 667                          size_t idx_start, size_t idx_len,
 668                          bool clear, char *result, unsigned maxlen)
 669{
 670        unsigned sz = 0;
 671        struct dm_stat *s;
 672        size_t x;
 673        sector_t start, end, step;
 674        size_t idx_end;
 675        struct dm_stat_shared *shared;
 676
 677        /*
 678         * Output format:
 679         *   <start_sector>+<length> counters
 680         */
 681
 682        mutex_lock(&stats->mutex);
 683
 684        s = __dm_stats_find(stats, id);
 685        if (!s) {
 686                mutex_unlock(&stats->mutex);
 687                return -ENOENT;
 688        }
 689
 690        idx_end = idx_start + idx_len;
 691        if (idx_end < idx_start ||
 692            idx_end > s->n_entries)
 693                idx_end = s->n_entries;
 694
 695        if (idx_start > idx_end)
 696                idx_start = idx_end;
 697
 698        step = s->step;
 699        start = s->start + (step * idx_start);
 700
 701        for (x = idx_start; x < idx_end; x++, start = end) {
 702                shared = &s->stat_shared[x];
 703                end = start + step;
 704                if (unlikely(end > s->end))
 705                        end = s->end;
 706
 707                __dm_stat_init_temporary_percpu_totals(shared, s, x);
 708
 709                DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n",
 710                       (unsigned long long)start,
 711                       (unsigned long long)step,
 712                       shared->tmp.ios[READ],
 713                       shared->tmp.merges[READ],
 714                       shared->tmp.sectors[READ],
 715                       dm_jiffies_to_msec64(shared->tmp.ticks[READ]),
 716                       shared->tmp.ios[WRITE],
 717                       shared->tmp.merges[WRITE],
 718                       shared->tmp.sectors[WRITE],
 719                       dm_jiffies_to_msec64(shared->tmp.ticks[WRITE]),
 720                       dm_stat_in_flight(shared),
 721                       dm_jiffies_to_msec64(shared->tmp.io_ticks_total),
 722                       dm_jiffies_to_msec64(shared->tmp.time_in_queue),
 723                       dm_jiffies_to_msec64(shared->tmp.io_ticks[READ]),
 724                       dm_jiffies_to_msec64(shared->tmp.io_ticks[WRITE]));
 725
 726                if (unlikely(sz + 1 >= maxlen))
 727                        goto buffer_overflow;
 728        }
 729
 730        if (clear)
 731                __dm_stat_clear(s, idx_start, idx_end, false);
 732
 733buffer_overflow:
 734        mutex_unlock(&stats->mutex);
 735
 736        return 1;
 737}
 738
 739static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data)
 740{
 741        struct dm_stat *s;
 742        const char *new_aux_data;
 743
 744        mutex_lock(&stats->mutex);
 745
 746        s = __dm_stats_find(stats, id);
 747        if (!s) {
 748                mutex_unlock(&stats->mutex);
 749                return -ENOENT;
 750        }
 751
 752        new_aux_data = kstrdup(aux_data, GFP_KERNEL);
 753        if (!new_aux_data) {
 754                mutex_unlock(&stats->mutex);
 755                return -ENOMEM;
 756        }
 757
 758        kfree(s->aux_data);
 759        s->aux_data = new_aux_data;
 760
 761        mutex_unlock(&stats->mutex);
 762
 763        return 0;
 764}
 765
 766static int message_stats_create(struct mapped_device *md,
 767                                unsigned argc, char **argv,
 768                                char *result, unsigned maxlen)
 769{
 770        int id;
 771        char dummy;
 772        unsigned long long start, end, len, step;
 773        unsigned divisor;
 774        const char *program_id, *aux_data;
 775
 776        /*
 777         * Input format:
 778         *   <range> <step> [<program_id> [<aux_data>]]
 779         */
 780
 781        if (argc < 3 || argc > 5)
 782                return -EINVAL;
 783
 784        if (!strcmp(argv[1], "-")) {
 785                start = 0;
 786                len = dm_get_size(md);
 787                if (!len)
 788                        len = 1;
 789        } else if (sscanf(argv[1], "%llu+%llu%c", &start, &len, &dummy) != 2 ||
 790                   start != (sector_t)start || len != (sector_t)len)
 791                return -EINVAL;
 792
 793        end = start + len;
 794        if (start >= end)
 795                return -EINVAL;
 796
 797        if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) {
 798                step = end - start;
 799                if (do_div(step, divisor))
 800                        step++;
 801                if (!step)
 802                        step = 1;
 803        } else if (sscanf(argv[2], "%llu%c", &step, &dummy) != 1 ||
 804                   step != (sector_t)step || !step)
 805                return -EINVAL;
 806
 807        program_id = "-";
 808        aux_data = "-";
 809
 810        if (argc > 3)
 811                program_id = argv[3];
 812
 813        if (argc > 4)
 814                aux_data = argv[4];
 815
 816        /*
 817         * If a buffer overflow happens after we created the region,
 818         * it's too late (the userspace would retry with a larger
 819         * buffer, but the region id that caused the overflow is already
 820         * leaked).  So we must detect buffer overflow in advance.
 821         */
 822        snprintf(result, maxlen, "%d", INT_MAX);
 823        if (dm_message_test_buffer_overflow(result, maxlen))
 824                return 1;
 825
 826        id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data,
 827                             dm_internal_suspend, dm_internal_resume, md);
 828        if (id < 0)
 829                return id;
 830
 831        snprintf(result, maxlen, "%d", id);
 832
 833        return 1;
 834}
 835
 836static int message_stats_delete(struct mapped_device *md,
 837                                unsigned argc, char **argv)
 838{
 839        int id;
 840        char dummy;
 841
 842        if (argc != 2)
 843                return -EINVAL;
 844
 845        if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
 846                return -EINVAL;
 847
 848        return dm_stats_delete(dm_get_stats(md), id);
 849}
 850
 851static int message_stats_clear(struct mapped_device *md,
 852                               unsigned argc, char **argv)
 853{
 854        int id;
 855        char dummy;
 856
 857        if (argc != 2)
 858                return -EINVAL;
 859
 860        if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
 861                return -EINVAL;
 862
 863        return dm_stats_clear(dm_get_stats(md), id);
 864}
 865
 866static int message_stats_list(struct mapped_device *md,
 867                              unsigned argc, char **argv,
 868                              char *result, unsigned maxlen)
 869{
 870        int r;
 871        const char *program = NULL;
 872
 873        if (argc < 1 || argc > 2)
 874                return -EINVAL;
 875
 876        if (argc > 1) {
 877                program = kstrdup(argv[1], GFP_KERNEL);
 878                if (!program)
 879                        return -ENOMEM;
 880        }
 881
 882        r = dm_stats_list(dm_get_stats(md), program, result, maxlen);
 883
 884        kfree(program);
 885
 886        return r;
 887}
 888
 889static int message_stats_print(struct mapped_device *md,
 890                               unsigned argc, char **argv, bool clear,
 891                               char *result, unsigned maxlen)
 892{
 893        int id;
 894        char dummy;
 895        unsigned long idx_start = 0, idx_len = ULONG_MAX;
 896
 897        if (argc != 2 && argc != 4)
 898                return -EINVAL;
 899
 900        if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
 901                return -EINVAL;
 902
 903        if (argc > 3) {
 904                if (strcmp(argv[2], "-") &&
 905                    sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1)
 906                        return -EINVAL;
 907                if (strcmp(argv[3], "-") &&
 908                    sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1)
 909                        return -EINVAL;
 910        }
 911
 912        return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear,
 913                              result, maxlen);
 914}
 915
 916static int message_stats_set_aux(struct mapped_device *md,
 917                                 unsigned argc, char **argv)
 918{
 919        int id;
 920        char dummy;
 921
 922        if (argc != 3)
 923                return -EINVAL;
 924
 925        if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
 926                return -EINVAL;
 927
 928        return dm_stats_set_aux(dm_get_stats(md), id, argv[2]);
 929}
 930
 931int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
 932                     char *result, unsigned maxlen)
 933{
 934        int r;
 935
 936        if (dm_request_based(md)) {
 937                DMWARN("Statistics are only supported for bio-based devices");
 938                return -EOPNOTSUPP;
 939        }
 940
 941        /* All messages here must start with '@' */
 942        if (!strcasecmp(argv[0], "@stats_create"))
 943                r = message_stats_create(md, argc, argv, result, maxlen);
 944        else if (!strcasecmp(argv[0], "@stats_delete"))
 945                r = message_stats_delete(md, argc, argv);
 946        else if (!strcasecmp(argv[0], "@stats_clear"))
 947                r = message_stats_clear(md, argc, argv);
 948        else if (!strcasecmp(argv[0], "@stats_list"))
 949                r = message_stats_list(md, argc, argv, result, maxlen);
 950        else if (!strcasecmp(argv[0], "@stats_print"))
 951                r = message_stats_print(md, argc, argv, false, result, maxlen);
 952        else if (!strcasecmp(argv[0], "@stats_print_clear"))
 953                r = message_stats_print(md, argc, argv, true, result, maxlen);
 954        else if (!strcasecmp(argv[0], "@stats_set_aux"))
 955                r = message_stats_set_aux(md, argc, argv);
 956        else
 957                return 2; /* this wasn't a stats message */
 958
 959        if (r == -EINVAL)
 960                DMWARN("Invalid parameters for message %s", argv[0]);
 961
 962        return r;
 963}
 964
 965int __init dm_statistics_init(void)
 966{
 967        shared_memory_amount = 0;
 968        dm_stat_need_rcu_barrier = 0;
 969        return 0;
 970}
 971
 972void dm_statistics_exit(void)
 973{
 974        if (dm_stat_need_rcu_barrier)
 975                rcu_barrier();
 976        if (WARN_ON(shared_memory_amount))
 977                DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount);
 978}
 979
/* Expose shared_memory_amount as module parameter "stats_current_allocated_bytes" (S_IRUGO permissions). */
module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO);
MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics");
 982