linux/drivers/staging/rdma/hfi1/pio.c
   1/*
   2 *
   3 * This file is provided under a dual BSD/GPLv2 license.  When using or
   4 * redistributing this file, you may do so under either license.
   5 *
   6 * GPL LICENSE SUMMARY
   7 *
   8 * Copyright(c) 2015 Intel Corporation.
   9 *
  10 * This program is free software; you can redistribute it and/or modify
  11 * it under the terms of version 2 of the GNU General Public License as
  12 * published by the Free Software Foundation.
  13 *
  14 * This program is distributed in the hope that it will be useful, but
  15 * WITHOUT ANY WARRANTY; without even the implied warranty of
  16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17 * General Public License for more details.
  18 *
  19 * BSD LICENSE
  20 *
  21 * Copyright(c) 2015 Intel Corporation.
  22 *
  23 * Redistribution and use in source and binary forms, with or without
  24 * modification, are permitted provided that the following conditions
  25 * are met:
  26 *
  27 *  - Redistributions of source code must retain the above copyright
  28 *    notice, this list of conditions and the following disclaimer.
  29 *  - Redistributions in binary form must reproduce the above copyright
  30 *    notice, this list of conditions and the following disclaimer in
  31 *    the documentation and/or other materials provided with the
  32 *    distribution.
  33 *  - Neither the name of Intel Corporation nor the names of its
  34 *    contributors may be used to endorse or promote products derived
  35 *    from this software without specific prior written permission.
  36 *
  37 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  40 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  41 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  42 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  43 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  44 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  45 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  46 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  47 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  48 *
  49 */
  50
  51#include <linux/delay.h>
  52#include "hfi.h"
  53#include "qp.h"
  54#include "trace.h"
  55
  56#define SC_CTXT_PACKET_EGRESS_TIMEOUT 350 /* in chip cycles */
  57
  58#define SC(name) SEND_CTXT_##name
  59/*
  60 * Send Context functions
  61 */
  62static void sc_wait_for_packet_egress(struct send_context *sc, int pause);
  63
  64/*
  65 * Set the CM reset bit and wait for it to clear.  Use the provided
  66 * sendctrl register.  This routine has no locking.
  67 */
  68void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl)
  69{
  70        write_csr(dd, SEND_CTRL, sendctrl | SEND_CTRL_CM_RESET_SMASK);
  71        while (1) {
  72                udelay(1);
  73                sendctrl = read_csr(dd, SEND_CTRL);
  74                if ((sendctrl & SEND_CTRL_CM_RESET_SMASK) == 0)
  75                        break;
  76        }
  77}
  78
  79/* defined in header release 48 and higher */
  80#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT
  81#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
  82#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull
  83#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
  84                << SEND_CTRL_UNSUPPORTED_VL_SHIFT)
  85#endif
  86
  87/* global control of PIO send */
  88void pio_send_control(struct hfi1_devdata *dd, int op)
  89{
  90        u64 reg, mask;
  91        unsigned long flags;
  92        int write = 1;  /* write sendctrl back */
  93        int flush = 0;  /* re-read sendctrl to make sure it is flushed */
  94
  95        spin_lock_irqsave(&dd->sendctrl_lock, flags);
  96
  97        reg = read_csr(dd, SEND_CTRL);
  98        switch (op) {
  99        case PSC_GLOBAL_ENABLE:
 100                reg |= SEND_CTRL_SEND_ENABLE_SMASK;
 101        /* Fall through */
 102        case PSC_DATA_VL_ENABLE:
 103                /* Disallow sending on VLs not enabled */
 104                mask = (((~0ull)<<num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK)<<
 105                                SEND_CTRL_UNSUPPORTED_VL_SHIFT;
 106                reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask;
 107                break;
 108        case PSC_GLOBAL_DISABLE:
 109                reg &= ~SEND_CTRL_SEND_ENABLE_SMASK;
 110                break;
 111        case PSC_GLOBAL_VLARB_ENABLE:
 112                reg |= SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
 113                break;
 114        case PSC_GLOBAL_VLARB_DISABLE:
 115                reg &= ~SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
 116                break;
 117        case PSC_CM_RESET:
 118                __cm_reset(dd, reg);
 119                write = 0; /* CSR already written (and flushed) */
 120                break;
 121        case PSC_DATA_VL_DISABLE:
 122                reg |= SEND_CTRL_UNSUPPORTED_VL_SMASK;
 123                flush = 1;
 124                break;
 125        default:
 126                dd_dev_err(dd, "%s: invalid control %d\n", __func__, op);
 127                break;
 128        }
 129
 130        if (write) {
 131                write_csr(dd, SEND_CTRL, reg);
 132                if (flush)
 133                        (void) read_csr(dd, SEND_CTRL); /* flush write */
 134        }
 135
 136        spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
 137}
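     /*
      * Example of the PSC_DATA_VL_ENABLE mask computation above, with
      * num_vls == 4:
      *
      *      (~0ull << 4) & SEND_CTRL_UNSUPPORTED_VL_MASK == 0xf0
      *      0xf0 << SEND_CTRL_UNSUPPORTED_VL_SHIFT       == 0x780
      *
      * i.e. data VLs 4-7 are marked unsupported while VLs 0-3 remain
      * sendable.
      */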
 138
 139/* number of send context memory pools */
 140#define NUM_SC_POOLS 2
 141
 142/* Send Context Size (SCS) wildcards */
 143#define SCS_POOL_0 -1
 144#define SCS_POOL_1 -2
 145/* Send Context Count (SCC) wildcards */
 146#define SCC_PER_VL -1
 147#define SCC_PER_CPU  -2
 148
 149#define SCC_PER_KRCVQ  -3
 150#define SCC_ACK_CREDITS  32
 151
 152#define PIO_WAIT_BATCH_SIZE 5
 153
 154/* default send context sizes */
 155static struct sc_config_sizes sc_config_sizes[SC_MAX] = {
 156        [SC_KERNEL] = { .size  = SCS_POOL_0,    /* even divide, pool 0 */
  157                        .count = SCC_PER_VL }, /* one per VL */
 158        [SC_ACK]    = { .size  = SCC_ACK_CREDITS,
 159                        .count = SCC_PER_KRCVQ },
 160        [SC_USER]   = { .size  = SCS_POOL_0,    /* even divide, pool 0 */
 161                        .count = SCC_PER_CPU }, /* one per CPU */
 162
 163};
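     /*
      * The wildcards above are resolved in init_sc_pools_and_sizes():
      * kernel contexts become one per data VL plus one for VL15, ack
      * contexts become one per kernel receive queue at a fixed 32
      * credits, and user contexts become one per user receive context
      * ("one per CPU"), with kernel and user sizes taken from an even
      * split of pool 0.
      */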
 164
 165/* send context memory pool configuration */
 166struct mem_pool_config {
 167        int centipercent;       /* % of memory, in 100ths of 1% */
 168        int absolute_blocks;    /* absolute block count */
 169};
 170
 171/* default memory pool configuration: 100% in pool 0 */
 172static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = {
 173        /* centi%, abs blocks */
 174        {  10000,     -1 },             /* pool 0 */
 175        {      0,     -1 },             /* pool 1 */
 176};
 177
 178/* memory pool information, used when calculating final sizes */
 179struct mem_pool_info {
 180        int centipercent;       /* 100th of 1% of memory to use, -1 if blocks
 181                                   already set */
 182        int count;              /* count of contexts in the pool */
 183        int blocks;             /* block size of the pool */
 184        int size;               /* context size, in blocks */
 185};
 186
 187/*
 188 * Convert a pool wildcard to a valid pool index.  The wildcards
 189 * start at -1 and increase negatively.  Map them as:
 190 *      -1 => 0
 191 *      -2 => 1
 192 *      etc.
 193 *
 194 * Return -1 on non-wildcard input, otherwise convert to a pool number.
 195 */
 196static int wildcard_to_pool(int wc)
 197{
 198        if (wc >= 0)
 199                return -1;      /* non-wildcard */
 200        return -wc - 1;
 201}
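     /*
      * For example:
      *      wildcard_to_pool(SCS_POOL_0)  returns 0
      *      wildcard_to_pool(SCS_POOL_1)  returns 1
      *      wildcard_to_pool(64)          returns -1 (a fixed size, not a pool)
      */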
 202
 203static const char *sc_type_names[SC_MAX] = {
 204        "kernel",
 205        "ack",
 206        "user"
 207};
 208
 209static const char *sc_type_name(int index)
 210{
 211        if (index < 0 || index >= SC_MAX)
 212                return "unknown";
 213        return sc_type_names[index];
 214}
 215
 216/*
 217 * Read the send context memory pool configuration and send context
 218 * size configuration.  Replace any wildcards and come up with final
 219 * counts and sizes for the send context types.
 220 */
 221int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
 222{
 223        struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } };
 224        int total_blocks = (dd->chip_pio_mem_size / PIO_BLOCK_SIZE) - 1;
 225        int total_contexts = 0;
 226        int fixed_blocks;
 227        int pool_blocks;
 228        int used_blocks;
 229        int cp_total;           /* centipercent total */
 230        int ab_total;           /* absolute block total */
 231        int extra;
 232        int i;
 233
 234        /*
 235         * Step 0:
 236         *      - copy the centipercents/absolute sizes from the pool config
 237         *      - sanity check these values
 238         *      - add up centipercents, then later check for full value
 239         *      - add up absolute blocks, then later check for over-commit
 240         */
 241        cp_total = 0;
 242        ab_total = 0;
 243        for (i = 0; i < NUM_SC_POOLS; i++) {
 244                int cp = sc_mem_pool_config[i].centipercent;
 245                int ab = sc_mem_pool_config[i].absolute_blocks;
 246
 247                /*
 248                 * A negative value is "unused" or "invalid".  Both *can*
 249                 * be valid, but centipercent wins, so check that first
 250                 */
 251                if (cp >= 0) {                  /* centipercent valid */
 252                        cp_total += cp;
 253                } else if (ab >= 0) {           /* absolute blocks valid */
 254                        ab_total += ab;
 255                } else {                        /* neither valid */
 256                        dd_dev_err(
 257                                dd,
 258                                "Send context memory pool %d: both the block count and centipercent are invalid\n",
 259                                i);
 260                        return -EINVAL;
 261                }
 262
 263                mem_pool_info[i].centipercent = cp;
 264                mem_pool_info[i].blocks = ab;
 265        }
 266
 267        /* do not use both % and absolute blocks for different pools */
 268        if (cp_total != 0 && ab_total != 0) {
 269                dd_dev_err(
 270                        dd,
 271                        "All send context memory pools must be described as either centipercent or blocks, no mixing between pools\n");
 272                return -EINVAL;
 273        }
 274
 275        /* if any percentages are present, they must add up to 100% x 100 */
 276        if (cp_total != 0 && cp_total != 10000) {
 277                dd_dev_err(
 278                        dd,
 279                        "Send context memory pool centipercent is %d, expecting 10000\n",
 280                        cp_total);
 281                return -EINVAL;
 282        }
 283
 284        /* the absolute pool total cannot be more than the mem total */
 285        if (ab_total > total_blocks) {
 286                dd_dev_err(
 287                        dd,
 288                        "Send context memory pool absolute block count %d is larger than the memory size %d\n",
 289                        ab_total, total_blocks);
 290                return -EINVAL;
 291        }
 292
 293        /*
 294         * Step 2:
 295         *      - copy from the context size config
 296         *      - replace context type wildcard counts with real values
 297         *      - add up non-memory pool block sizes
 298         *      - add up memory pool user counts
 299         */
 300        fixed_blocks = 0;
 301        for (i = 0; i < SC_MAX; i++) {
 302                int count = sc_config_sizes[i].count;
 303                int size = sc_config_sizes[i].size;
 304                int pool;
 305
 306                /*
 307                 * Sanity check count: Either a positive value or
 308                 * one of the expected wildcards is valid.  The positive
 309                 * value is checked later when we compare against total
 310                 * memory available.
 311                 */
 312                if (i == SC_ACK) {
 313                        count = dd->n_krcv_queues;
 314                } else if (i == SC_KERNEL) {
 315                        count = num_vls + 1 /* VL15 */;
 316                } else if (count == SCC_PER_CPU) {
 317                        count = dd->num_rcv_contexts - dd->n_krcv_queues;
 318                } else if (count < 0) {
 319                        dd_dev_err(
 320                                dd,
 321                                "%s send context invalid count wildcard %d\n",
 322                                sc_type_name(i), count);
 323                        return -EINVAL;
 324                }
 325                if (total_contexts + count > dd->chip_send_contexts)
 326                        count = dd->chip_send_contexts - total_contexts;
 327
 328                total_contexts += count;
 329
 330                /*
 331                 * Sanity check pool: The conversion will return a pool
 332                 * number or -1 if a fixed (non-negative) value.  The fixed
 333                 * value is checked later when we compare against
 334                 * total memory available.
 335                 */
 336                pool = wildcard_to_pool(size);
 337                if (pool == -1) {                       /* non-wildcard */
 338                        fixed_blocks += size * count;
 339                } else if (pool < NUM_SC_POOLS) {       /* valid wildcard */
 340                        mem_pool_info[pool].count += count;
 341                } else {                                /* invalid wildcard */
 342                        dd_dev_err(
 343                                dd,
 344                                "%s send context invalid pool wildcard %d\n",
 345                                sc_type_name(i), size);
 346                        return -EINVAL;
 347                }
 348
 349                dd->sc_sizes[i].count = count;
 350                dd->sc_sizes[i].size = size;
 351        }
 352        if (fixed_blocks > total_blocks) {
 353                dd_dev_err(
 354                        dd,
 355                        "Send context fixed block count, %u, larger than total block count %u\n",
 356                        fixed_blocks, total_blocks);
 357                return -EINVAL;
 358        }
 359
 360        /* step 3: calculate the blocks in the pools, and pool context sizes */
 361        pool_blocks = total_blocks - fixed_blocks;
 362        if (ab_total > pool_blocks) {
 363                dd_dev_err(
 364                        dd,
 365                        "Send context fixed pool sizes, %u, larger than pool block count %u\n",
 366                        ab_total, pool_blocks);
 367                return -EINVAL;
 368        }
 369        /* subtract off the fixed pool blocks */
 370        pool_blocks -= ab_total;
 371
 372        for (i = 0; i < NUM_SC_POOLS; i++) {
 373                struct mem_pool_info *pi = &mem_pool_info[i];
 374
 375                /* % beats absolute blocks */
 376                if (pi->centipercent >= 0)
 377                        pi->blocks = (pool_blocks * pi->centipercent) / 10000;
 378
 379                if (pi->blocks == 0 && pi->count != 0) {
 380                        dd_dev_err(
 381                                dd,
 382                                "Send context memory pool %d has %u contexts, but no blocks\n",
 383                                i, pi->count);
 384                        return -EINVAL;
 385                }
 386                if (pi->count == 0) {
 387                        /* warn about wasted blocks */
 388                        if (pi->blocks != 0)
 389                                dd_dev_err(
 390                                        dd,
 391                                        "Send context memory pool %d has %u blocks, but zero contexts\n",
 392                                        i, pi->blocks);
 393                        pi->size = 0;
 394                } else {
 395                        pi->size = pi->blocks / pi->count;
 396                }
 397        }
 398
 399        /* step 4: fill in the context type sizes from the pool sizes */
 400        used_blocks = 0;
 401        for (i = 0; i < SC_MAX; i++) {
 402                if (dd->sc_sizes[i].size < 0) {
 403                        unsigned pool = wildcard_to_pool(dd->sc_sizes[i].size);
 404
 405                        WARN_ON_ONCE(pool >= NUM_SC_POOLS);
 406                        dd->sc_sizes[i].size = mem_pool_info[pool].size;
 407                }
 408                /* make sure we are not larger than what is allowed by the HW */
 409#define PIO_MAX_BLOCKS 1024
 410                if (dd->sc_sizes[i].size > PIO_MAX_BLOCKS)
 411                        dd->sc_sizes[i].size = PIO_MAX_BLOCKS;
 412
 413                /* calculate our total usage */
 414                used_blocks += dd->sc_sizes[i].size * dd->sc_sizes[i].count;
 415        }
 416        extra = total_blocks - used_blocks;
 417        if (extra != 0)
 418                dd_dev_info(dd, "unused send context blocks: %d\n", extra);
 419
 420        return total_contexts;
 421}
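     /*
      * Worked example with hypothetical numbers: assume 2048 total
      * blocks, num_vls == 8, 16 kernel receive queues and 40 user
      * receive contexts.  The ack contexts are fixed at 32 credits each,
      * so fixed_blocks = 16 * 32 = 512 and pool_blocks = 1536.  Pool 0
      * owns 100% of that and serves 9 kernel + 40 user = 49 contexts,
      * giving each 1536 / 49 = 31 blocks; the remainder is reported as
      * unused.
      */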
 422
 423int init_send_contexts(struct hfi1_devdata *dd)
 424{
 425        u16 base;
 426        int ret, i, j, context;
 427
 428        ret = init_credit_return(dd);
 429        if (ret)
 430                return ret;
 431
 432        dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8),
 433                                        GFP_KERNEL);
 434        dd->send_contexts = kcalloc(dd->num_send_contexts,
 435                                        sizeof(struct send_context_info),
 436                                        GFP_KERNEL);
 437        if (!dd->send_contexts || !dd->hw_to_sw) {
 438                kfree(dd->hw_to_sw);
 439                kfree(dd->send_contexts);
 440                free_credit_return(dd);
 441                return -ENOMEM;
 442        }
 443
 444        /* hardware context map starts with invalid send context indices */
 445        for (i = 0; i < TXE_NUM_CONTEXTS; i++)
 446                dd->hw_to_sw[i] = INVALID_SCI;
 447
 448        /*
 449         * All send contexts have their credit sizes.  Allocate credits
 450         * for each context one after another from the global space.
 451         */
 452        context = 0;
 453        base = 1;
 454        for (i = 0; i < SC_MAX; i++) {
 455                struct sc_config_sizes *scs = &dd->sc_sizes[i];
 456
 457                for (j = 0; j < scs->count; j++) {
 458                        struct send_context_info *sci =
 459                                                &dd->send_contexts[context];
 460                        sci->type = i;
 461                        sci->base = base;
 462                        sci->credits = scs->size;
 463
 464                        context++;
 465                        base += scs->size;
 466                }
 467        }
 468
 469        return 0;
 470}
 471
 472/*
 473 * Allocate a software index and hardware context of the given type.
 474 *
 475 * Must be called with dd->sc_lock held.
 476 */
 477static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index,
 478                       u32 *hw_context)
 479{
 480        struct send_context_info *sci;
 481        u32 index;
 482        u32 context;
 483
 484        for (index = 0, sci = &dd->send_contexts[0];
 485                        index < dd->num_send_contexts; index++, sci++) {
 486                if (sci->type == type && sci->allocated == 0) {
 487                        sci->allocated = 1;
  488                        /* 1:1 mapping, but reversed so sw_index and hw_context differ */
 489                        context = dd->chip_send_contexts - index - 1;
 490                        dd->hw_to_sw[context] = index;
 491                        *sw_index = index;
 492                        *hw_context = context;
 493                        return 0; /* success */
 494                }
 495        }
 496        dd_dev_err(dd, "Unable to locate a free type %d send context\n", type);
 497        return -ENOSPC;
 498}
 499
 500/*
 501 * Free the send context given by its software index.
 502 *
 503 * Must be called with dd->sc_lock held.
 504 */
 505static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context)
 506{
 507        struct send_context_info *sci;
 508
 509        sci = &dd->send_contexts[sw_index];
 510        if (!sci->allocated) {
 511                dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n",
 512                        __func__, sw_index, hw_context);
 513        }
 514        sci->allocated = 0;
 515        dd->hw_to_sw[hw_context] = INVALID_SCI;
 516}
 517
 518/* return the base context of a context in a group */
 519static inline u32 group_context(u32 context, u32 group)
 520{
 521        return (context >> group) << group;
 522}
 523
 524/* return the size of a group */
 525static inline u32 group_size(u32 group)
 526{
 527        return 1 << group;
 528}
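     /*
      * For example, with group == 3 a group spans group_size(3) == 8
      * contexts, and group_context(13, 3) == 8, i.e. context 13 belongs
      * to the group based at context 8.
      */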
 529
 530/*
 531 * Obtain the credit return addresses, kernel virtual and physical, for the
 532 * given sc.
 533 *
 534 * To understand this routine:
 535 * o va and pa are arrays of struct credit_return.  One for each physical
 536 *   send context, per NUMA.
 537 * o Each send context always looks in its relative location in a struct
 538 *   credit_return for its credit return.
 539 * o Each send context in a group must have its return address CSR programmed
 540 *   with the same value.  Use the address of the first send context in the
 541 *   group.
 542 */
 543static void cr_group_addresses(struct send_context *sc, dma_addr_t *pa)
 544{
 545        u32 gc = group_context(sc->hw_context, sc->group);
 546        u32 index = sc->hw_context & 0x7;
 547
 548        sc->hw_free = &sc->dd->cr_base[sc->node].va[gc].cr[index];
 549        *pa = (unsigned long)
 550               &((struct credit_return *)sc->dd->cr_base[sc->node].pa)[gc];
 551}
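     /*
      * If grouping were used (the driver currently sets sc->group = 0),
      * e.g. with group == 3, hw contexts 8-15 would all program the
      * address of the credit_return entry for context 8, and context 11
      * would read its own return from cr[11 & 0x7] == cr[3] within that
      * entry.
      */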
 552
 553/*
 554 * Work queue function triggered in error interrupt routine for
 555 * kernel contexts.
 556 */
 557static void sc_halted(struct work_struct *work)
 558{
 559        struct send_context *sc;
 560
 561        sc = container_of(work, struct send_context, halt_work);
 562        sc_restart(sc);
 563}
 564
 565/*
 566 * Calculate PIO block threshold for this send context using the given MTU.
 567 * Trigger a return when one MTU plus optional header of credits remain.
 568 *
 569 * Parameter mtu is in bytes.
 570 * Parameter hdrqentsize is in DWORDs.
 571 *
 572 * Return value is what to write into the CSR: trigger return when
 573 * unreturned credits pass this count.
 574 */
 575u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize)
 576{
 577        u32 release_credits;
 578        u32 threshold;
 579
 580        /* add in the header size, then divide by the PIO block size */
 581        mtu += hdrqentsize << 2;
 582        release_credits = DIV_ROUND_UP(mtu, PIO_BLOCK_SIZE);
 583
 584        /* check against this context's credits */
 585        if (sc->credits <= release_credits)
 586                threshold = 1;
 587        else
 588                threshold = sc->credits - release_credits;
 589
 590        return threshold;
 591}
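     /*
      * Example, assuming a 64-byte PIO block: an 8192-byte MTU with a
      * 32-dword header queue entry needs DIV_ROUND_UP(8192 + 128, 64) ==
      * 130 release credits, so a context with 1024 credits gets a
      * threshold of 894, while a context with 130 or fewer credits falls
      * back to a threshold of 1.
      */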
 592
 593/*
 594 * Calculate credit threshold in terms of percent of the allocated credits.
 595 * Trigger when unreturned credits equal or exceed the percentage of the whole.
 596 *
 597 * Return value is what to write into the CSR: trigger return when
 598 * unreturned credits pass this count.
 599 */
 600static u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
 601{
 602        return (sc->credits * percent) / 100;
 603}
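     /* For example, a 50% threshold on a 1024-credit context is 512. */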
 604
 605/*
 606 * Set the credit return threshold.
 607 */
 608void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold)
 609{
 610        unsigned long flags;
 611        u32 old_threshold;
 612        int force_return = 0;
 613
 614        spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
 615
 616        old_threshold = (sc->credit_ctrl >>
 617                                SC(CREDIT_CTRL_THRESHOLD_SHIFT))
 618                         & SC(CREDIT_CTRL_THRESHOLD_MASK);
 619
 620        if (new_threshold != old_threshold) {
 621                sc->credit_ctrl =
 622                        (sc->credit_ctrl
 623                                & ~SC(CREDIT_CTRL_THRESHOLD_SMASK))
 624                        | ((new_threshold
 625                                & SC(CREDIT_CTRL_THRESHOLD_MASK))
 626                           << SC(CREDIT_CTRL_THRESHOLD_SHIFT));
 627                write_kctxt_csr(sc->dd, sc->hw_context,
 628                        SC(CREDIT_CTRL), sc->credit_ctrl);
 629
 630                /* force a credit return on change to avoid a possible stall */
 631                force_return = 1;
 632        }
 633
 634        spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
 635
 636        if (force_return)
 637                sc_return_credits(sc);
 638}
 639
 640/*
 641 * set_pio_integrity
 642 *
 643 * Set the CHECK_ENABLE register for the send context 'sc'.
 644 */
 645void set_pio_integrity(struct send_context *sc)
 646{
 647        struct hfi1_devdata *dd = sc->dd;
 648        u64 reg = 0;
 649        u32 hw_context = sc->hw_context;
 650        int type = sc->type;
 651
 652        /*
 653         * No integrity checks if HFI1_CAP_NO_INTEGRITY is set, or if
 654         * we're snooping.
 655         */
 656        if (likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) &&
 657            dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE)
 658                reg = hfi1_pkt_default_send_ctxt_mask(dd, type);
 659
 660        write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), reg);
 661}
 662
 663/*
 664 * Allocate a NUMA relative send context structure of the given type along
 665 * with a HW context.
 666 */
 667struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
 668                              uint hdrqentsize, int numa)
 669{
 670        struct send_context_info *sci;
 671        struct send_context *sc;
 672        dma_addr_t pa;
 673        unsigned long flags;
 674        u64 reg;
 675        u32 thresh;
 676        u32 sw_index;
 677        u32 hw_context;
 678        int ret;
 679        u8 opval, opmask;
 680
 681        /* do not allocate while frozen */
 682        if (dd->flags & HFI1_FROZEN)
 683                return NULL;
 684
 685        sc = kzalloc_node(sizeof(struct send_context), GFP_KERNEL, numa);
 686        if (!sc)
 687                return NULL;
 688
 689        spin_lock_irqsave(&dd->sc_lock, flags);
 690        ret = sc_hw_alloc(dd, type, &sw_index, &hw_context);
 691        if (ret) {
 692                spin_unlock_irqrestore(&dd->sc_lock, flags);
 693                kfree(sc);
 694                return NULL;
 695        }
 696
 697        sci = &dd->send_contexts[sw_index];
 698        sci->sc = sc;
 699
 700        sc->dd = dd;
 701        sc->node = numa;
 702        sc->type = type;
 703        spin_lock_init(&sc->alloc_lock);
 704        spin_lock_init(&sc->release_lock);
 705        spin_lock_init(&sc->credit_ctrl_lock);
 706        INIT_LIST_HEAD(&sc->piowait);
 707        INIT_WORK(&sc->halt_work, sc_halted);
 708        atomic_set(&sc->buffers_allocated, 0);
 709        init_waitqueue_head(&sc->halt_wait);
 710
 711        /* grouping is always single context for now */
 712        sc->group = 0;
 713
 714        sc->sw_index = sw_index;
 715        sc->hw_context = hw_context;
 716        cr_group_addresses(sc, &pa);
 717        sc->credits = sci->credits;
 718
 719/* PIO Send Memory Address details */
 720#define PIO_ADDR_CONTEXT_MASK 0xfful
 721#define PIO_ADDR_CONTEXT_SHIFT 16
 722        sc->base_addr = dd->piobase + ((hw_context & PIO_ADDR_CONTEXT_MASK)
 723                                        << PIO_ADDR_CONTEXT_SHIFT);
 724
 725        /* set base and credits */
 726        reg = ((sci->credits & SC(CTRL_CTXT_DEPTH_MASK))
 727                                        << SC(CTRL_CTXT_DEPTH_SHIFT))
 728                | ((sci->base & SC(CTRL_CTXT_BASE_MASK))
 729                                        << SC(CTRL_CTXT_BASE_SHIFT));
 730        write_kctxt_csr(dd, hw_context, SC(CTRL), reg);
 731
 732        set_pio_integrity(sc);
 733
 734        /* unmask all errors */
 735        write_kctxt_csr(dd, hw_context, SC(ERR_MASK), (u64)-1);
 736
 737        /* set the default partition key */
 738        write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY),
 739                (DEFAULT_PKEY &
 740                        SC(CHECK_PARTITION_KEY_VALUE_MASK))
 741                    << SC(CHECK_PARTITION_KEY_VALUE_SHIFT));
 742
 743        /* per context type checks */
 744        if (type == SC_USER) {
 745                opval = USER_OPCODE_CHECK_VAL;
 746                opmask = USER_OPCODE_CHECK_MASK;
 747        } else {
 748                opval = OPCODE_CHECK_VAL_DISABLED;
 749                opmask = OPCODE_CHECK_MASK_DISABLED;
 750        }
 751
 752        /* set the send context check opcode mask and value */
 753        write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE),
 754                ((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) |
 755                ((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT)));
 756
 757        /* set up credit return */
 758        reg = pa & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK);
 759        write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), reg);
 760
 761        /*
 762         * Calculate the initial credit return threshold.
 763         *
 764         * For Ack contexts, set a threshold for half the credits.
 765         * For User contexts use the given percentage.  This has been
 766         * sanitized on driver start-up.
 767         * For Kernel contexts, use the default MTU plus a header.
 768         */
 769        if (type == SC_ACK) {
 770                thresh = sc_percent_to_threshold(sc, 50);
 771        } else if (type == SC_USER) {
 772                thresh = sc_percent_to_threshold(sc,
 773                                user_credit_return_threshold);
 774        } else { /* kernel */
 775                thresh = sc_mtu_to_threshold(sc, hfi1_max_mtu, hdrqentsize);
 776        }
 777        reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT);
 778        /* add in early return */
 779        if (type == SC_USER && HFI1_CAP_IS_USET(EARLY_CREDIT_RETURN))
 780                reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
 781        else if (HFI1_CAP_IS_KSET(EARLY_CREDIT_RETURN)) /* kernel, ack */
 782                reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
 783
 784        /* set up write-through credit_ctrl */
 785        sc->credit_ctrl = reg;
 786        write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), reg);
 787
 788        /* User send contexts should not allow sending on VL15 */
 789        if (type == SC_USER) {
 790                reg = 1ULL << 15;
 791                write_kctxt_csr(dd, hw_context, SC(CHECK_VL), reg);
 792        }
 793
 794        spin_unlock_irqrestore(&dd->sc_lock, flags);
 795
 796        /*
 797         * Allocate shadow ring to track outstanding PIO buffers _after_
 798         * unlocking.  We don't know the size until the lock is held and
 799         * we can't allocate while the lock is held.  No one is using
 800         * the context yet, so allocate it now.
 801         *
 802         * User contexts do not get a shadow ring.
 803         */
 804        if (type != SC_USER) {
 805                /*
 806                 * Size the shadow ring 1 larger than the number of credits
 807                 * so head == tail can mean empty.
 808                 */
 809                sc->sr_size = sci->credits + 1;
 810                sc->sr = kzalloc_node(sizeof(union pio_shadow_ring) *
 811                                sc->sr_size, GFP_KERNEL, numa);
 812                if (!sc->sr) {
 813                        sc_free(sc);
 814                        return NULL;
 815                }
 816        }
 817
 818        dd_dev_info(dd,
 819                "Send context %u(%u) %s group %u credits %u credit_ctrl 0x%llx threshold %u\n",
 820                sw_index,
 821                hw_context,
 822                sc_type_name(type),
 823                sc->group,
 824                sc->credits,
 825                sc->credit_ctrl,
 826                thresh);
 827
 828        return sc;
 829}
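     /*
      * Sketch of a typical kernel send context life cycle built from the
      * routines in this file (illustrative only, not a verbatim caller):
      *
      *      sc = sc_alloc(dd, SC_KERNEL, hdrqentsize, numa);
      *      if (!sc || sc_enable(sc))
      *              goto bail;
      *      pbuf = sc_buffer_alloc(sc, dw_len, cb, arg);
      *      ...
      *      sc_disable(sc);
      *      sc_free(sc);
      */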
 830
 831/* free a per-NUMA send context structure */
 832void sc_free(struct send_context *sc)
 833{
 834        struct hfi1_devdata *dd;
 835        unsigned long flags;
 836        u32 sw_index;
 837        u32 hw_context;
 838
 839        if (!sc)
 840                return;
 841
 842        sc->flags |= SCF_IN_FREE;       /* ensure no restarts */
 843        dd = sc->dd;
 844        if (!list_empty(&sc->piowait))
 845                dd_dev_err(dd, "piowait list not empty!\n");
 846        sw_index = sc->sw_index;
 847        hw_context = sc->hw_context;
 848        sc_disable(sc); /* make sure the HW is disabled */
 849        flush_work(&sc->halt_work);
 850
 851        spin_lock_irqsave(&dd->sc_lock, flags);
 852        dd->send_contexts[sw_index].sc = NULL;
 853
 854        /* clear/disable all registers set in sc_alloc */
 855        write_kctxt_csr(dd, hw_context, SC(CTRL), 0);
 856        write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), 0);
 857        write_kctxt_csr(dd, hw_context, SC(ERR_MASK), 0);
 858        write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), 0);
 859        write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), 0);
 860        write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), 0);
 861        write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), 0);
 862
 863        /* release the index and context for re-use */
 864        sc_hw_free(dd, sw_index, hw_context);
 865        spin_unlock_irqrestore(&dd->sc_lock, flags);
 866
 867        kfree(sc->sr);
 868        kfree(sc);
 869}
 870
 871/* disable the context */
 872void sc_disable(struct send_context *sc)
 873{
 874        u64 reg;
 875        unsigned long flags;
 876        struct pio_buf *pbuf;
 877
 878        if (!sc)
 879                return;
 880
 881        /* do all steps, even if already disabled */
 882        spin_lock_irqsave(&sc->alloc_lock, flags);
 883        reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL));
 884        reg &= ~SC(CTRL_CTXT_ENABLE_SMASK);
 885        sc->flags &= ~SCF_ENABLED;
 886        sc_wait_for_packet_egress(sc, 1);
 887        write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg);
 888        spin_unlock_irqrestore(&sc->alloc_lock, flags);
 889
 890        /*
 891         * Flush any waiters.  Once the context is disabled,
 892         * credit return interrupts are stopped (although there
 893         * could be one in-process when the context is disabled).
 894         * Wait one microsecond for any lingering interrupts, then
 895         * proceed with the flush.
 896         */
 897        udelay(1);
 898        spin_lock_irqsave(&sc->release_lock, flags);
 899        if (sc->sr) {   /* this context has a shadow ring */
 900                while (sc->sr_tail != sc->sr_head) {
 901                        pbuf = &sc->sr[sc->sr_tail].pbuf;
 902                        if (pbuf->cb)
 903                                (*pbuf->cb)(pbuf->arg, PRC_SC_DISABLE);
 904                        sc->sr_tail++;
 905                        if (sc->sr_tail >= sc->sr_size)
 906                                sc->sr_tail = 0;
 907                }
 908        }
 909        spin_unlock_irqrestore(&sc->release_lock, flags);
 910}
 911
 912/* return SendEgressCtxtStatus.PacketOccupancy */
 913#define packet_occupancy(r) \
 914        (((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)\
 915        >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT)
 916
 917/* is egress halted on the context? */
 918#define egress_halted(r) \
 919        ((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK)
 920
 921/* wait for packet egress, optionally pause for credit return  */
 922static void sc_wait_for_packet_egress(struct send_context *sc, int pause)
 923{
 924        struct hfi1_devdata *dd = sc->dd;
 925        u64 reg = 0;
 926        u64 reg_prev;
 927        u32 loop = 0;
 928
 929        while (1) {
 930                reg_prev = reg;
 931                reg = read_csr(dd, sc->hw_context * 8 +
 932                               SEND_EGRESS_CTXT_STATUS);
 933                /* done if egress is stopped */
 934                if (egress_halted(reg))
 935                        break;
 936                reg = packet_occupancy(reg);
 937                if (reg == 0)
 938                        break;
 939                /* counter is reset if occupancy count changes */
 940                if (reg != reg_prev)
 941                        loop = 0;
 942                if (loop > 500) {
 943                        /* timed out - bounce the link */
 944                        dd_dev_err(dd,
 945                                "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n",
 946                                __func__, sc->sw_index,
 947                                sc->hw_context, (u32)reg);
 948                        queue_work(dd->pport->hfi1_wq,
 949                                &dd->pport->link_bounce_work);
 950                        break;
 951                }
 952                loop++;
 953                udelay(1);
 954        }
 955
 956        if (pause)
 957                /* Add additional delay to ensure chip returns all credits */
 958                pause_for_credit_return(dd);
 959}
 960
 961void sc_wait(struct hfi1_devdata *dd)
 962{
 963        int i;
 964
 965        for (i = 0; i < dd->num_send_contexts; i++) {
 966                struct send_context *sc = dd->send_contexts[i].sc;
 967
 968                if (!sc)
 969                        continue;
 970                sc_wait_for_packet_egress(sc, 0);
 971        }
 972}
 973
 974/*
 975 * Restart a context after it has been halted due to error.
 976 *
  977 * If the first step, waiting for the halt to be asserted, fails, return early.
 978 * Otherwise complain about timeouts but keep going.
 979 *
 980 * It is expected that allocations (enabled flag bit) have been shut off
 981 * already (only applies to kernel contexts).
 982 */
 983int sc_restart(struct send_context *sc)
 984{
 985        struct hfi1_devdata *dd = sc->dd;
 986        u64 reg;
 987        u32 loop;
 988        int count;
 989
 990        /* bounce off if not halted, or being free'd */
 991        if (!(sc->flags & SCF_HALTED) || (sc->flags & SCF_IN_FREE))
 992                return -EINVAL;
 993
 994        dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index,
 995                sc->hw_context);
 996
 997        /*
 998         * Step 1: Wait for the context to actually halt.
 999         *
1000         * The error interrupt is asynchronous to actually setting halt
1001         * on the context.
1002         */
1003        loop = 0;
1004        while (1) {
1005                reg = read_kctxt_csr(dd, sc->hw_context, SC(STATUS));
1006                if (reg & SC(STATUS_CTXT_HALTED_SMASK))
1007                        break;
1008                if (loop > 100) {
1009                        dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n",
1010                                __func__, sc->sw_index, sc->hw_context);
1011                        return -ETIME;
1012                }
1013                loop++;
1014                udelay(1);
1015        }
1016
1017        /*
1018         * Step 2: Ensure no users are still trying to write to PIO.
1019         *
1020         * For kernel contexts, we have already turned off buffer allocation.
1021         * Now wait for the buffer count to go to zero.
1022         *
1023         * For user contexts, the user handling code has cut off write access
1024         * to the context's PIO pages before calling this routine and will
1025         * restore write access after this routine returns.
1026         */
1027        if (sc->type != SC_USER) {
1028                /* kernel context */
1029                loop = 0;
1030                while (1) {
1031                        count = atomic_read(&sc->buffers_allocated);
1032                        if (count == 0)
1033                                break;
1034                        if (loop > 100) {
1035                                dd_dev_err(dd,
1036                                        "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n",
1037                                        __func__, sc->sw_index,
1038                                        sc->hw_context, count);
1039                        }
1040                        loop++;
1041                        udelay(1);
1042                }
1043        }
1044
1045        /*
1046         * Step 3: Wait for all packets to egress.
1047         * This is done while disabling the send context
1048         *
1049         * Step 4: Disable the context
1050         *
1051         * This is a superset of the halt.  After the disable, the
1052         * errors can be cleared.
1053         */
1054        sc_disable(sc);
1055
1056        /*
1057         * Step 5: Enable the context
1058         *
1059         * This enable will clear the halted flag and per-send context
1060         * error flags.
1061         */
1062        return sc_enable(sc);
1063}
1064
1065/*
1066 * PIO freeze processing.  To be called after the TXE block is fully frozen.
1067 * Go through all frozen send contexts and disable them.  The contexts are
1068 * already stopped by the freeze.
1069 */
1070void pio_freeze(struct hfi1_devdata *dd)
1071{
1072        struct send_context *sc;
1073        int i;
1074
1075        for (i = 0; i < dd->num_send_contexts; i++) {
1076                sc = dd->send_contexts[i].sc;
1077                /*
1078                 * Don't disable unallocated, unfrozen, or user send contexts.
1079                 * User send contexts will be disabled when the process
1080                 * calls into the driver to reset its context.
1081                 */
1082                if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
1083                        continue;
1084
1085                /* only need to disable, the context is already stopped */
1086                sc_disable(sc);
1087        }
1088}
1089
1090/*
1091 * Unfreeze PIO for kernel send contexts.  The precondition for calling this
1092 * is that all PIO send contexts have been disabled and the SPC freeze has
1093 * been cleared.  Now perform the last step and re-enable each kernel context.
1094 * User (PSM) processing will occur when PSM calls into the kernel to
1095 * acknowledge the freeze.
1096 */
1097void pio_kernel_unfreeze(struct hfi1_devdata *dd)
1098{
1099        struct send_context *sc;
1100        int i;
1101
1102        for (i = 0; i < dd->num_send_contexts; i++) {
1103                sc = dd->send_contexts[i].sc;
1104                if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
1105                        continue;
1106
1107                sc_enable(sc);  /* will clear the sc frozen flag */
1108        }
1109}
1110
1111/*
1112 * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear.
1113 * Returns:
1114 *      -ETIMEDOUT - if we wait too long
1115 *      -EIO       - if there was an error
1116 */
1117static int pio_init_wait_progress(struct hfi1_devdata *dd)
1118{
1119        u64 reg;
1120        int max, count = 0;
1121
1122        /* max is the longest possible HW init time / delay */
1123        max = (dd->icode == ICODE_FPGA_EMULATION) ? 120 : 5;
1124        while (1) {
1125                reg = read_csr(dd, SEND_PIO_INIT_CTXT);
1126                if (!(reg & SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK))
1127                        break;
1128                if (count >= max)
1129                        return -ETIMEDOUT;
1130                udelay(5);
1131                count++;
1132        }
1133
1134        return reg & SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK ? -EIO : 0;
1135}
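     /*
      * With the 5 microsecond delay above, this polls for at most about
      * 25 microseconds on real hardware and about 600 microseconds under
      * FPGA emulation before giving up with -ETIMEDOUT.
      */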
1136
1137/*
1138 * Reset all of the send contexts to their power-on state.  Used
1139 * only during manual init - no lock against sc_enable needed.
1140 */
1141void pio_reset_all(struct hfi1_devdata *dd)
1142{
1143        int ret;
1144
1145        /* make sure the init engine is not busy */
1146        ret = pio_init_wait_progress(dd);
1147        /* ignore any timeout */
1148        if (ret == -EIO) {
1149                /* clear the error */
1150                write_csr(dd, SEND_PIO_ERR_CLEAR,
1151                        SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK);
1152        }
1153
1154        /* reset init all */
1155        write_csr(dd, SEND_PIO_INIT_CTXT,
1156                        SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK);
1157        udelay(2);
1158        ret = pio_init_wait_progress(dd);
1159        if (ret < 0) {
1160                dd_dev_err(dd,
1161                        "PIO send context init %s while initializing all PIO blocks\n",
1162                        ret == -ETIMEDOUT ? "is stuck" : "had an error");
1163        }
1164}
1165
1166/* enable the context */
1167int sc_enable(struct send_context *sc)
1168{
1169        u64 sc_ctrl, reg, pio;
1170        struct hfi1_devdata *dd;
1171        unsigned long flags;
1172        int ret = 0;
1173
1174        if (!sc)
1175                return -EINVAL;
1176        dd = sc->dd;
1177
1178        /*
1179         * Obtain the allocator lock to guard against any allocation
1180         * attempts (which should not happen prior to context being
1181         * enabled). On the release/disable side we don't need to
1182         * worry about locking since the releaser will not do anything
1183         * if the context accounting values have not changed.
1184         */
1185        spin_lock_irqsave(&sc->alloc_lock, flags);
1186        sc_ctrl = read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
1187        if ((sc_ctrl & SC(CTRL_CTXT_ENABLE_SMASK)))
1188                goto unlock; /* already enabled */
1189
1190        /* IMPORTANT: only clear free and fill if transitioning 0 -> 1 */
1191
1192        *sc->hw_free = 0;
1193        sc->free = 0;
1194        sc->alloc_free = 0;
1195        sc->fill = 0;
1196        sc->sr_head = 0;
1197        sc->sr_tail = 0;
1198        sc->flags = 0;
1199        atomic_set(&sc->buffers_allocated, 0);
1200
1201        /*
1202         * Clear all per-context errors.  Some of these will be set when
1203         * we are re-enabling after a context halt.  Now that the context
1204         * is disabled, the halt will not clear until after the PIO init
1205         * engine runs below.
1206         */
1207        reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS));
1208        if (reg)
1209                write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR),
1210                        reg);
1211
1212        /*
1213         * The HW PIO initialization engine can handle only one init
1214         * request at a time. Serialize access to each device's engine.
1215         */
1216        spin_lock(&dd->sc_init_lock);
1217        /*
1218         * Since access to this code block is serialized and
1219         * each access waits for the initialization to complete
1220         * before releasing the lock, the PIO initialization engine
1221         * should not be in use, so we don't have to wait for the
1222         * InProgress bit to go down.
1223         */
1224        pio = ((sc->hw_context & SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK) <<
1225               SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT) |
1226                SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK;
1227        write_csr(dd, SEND_PIO_INIT_CTXT, pio);
1228        /*
1229         * Wait until the engine is done.  Give the chip the required time
1230         * so, hopefully, we read the register just once.
1231         */
1232        udelay(2);
1233        ret = pio_init_wait_progress(dd);
1234        spin_unlock(&dd->sc_init_lock);
1235        if (ret) {
1236                dd_dev_err(dd,
1237                           "sctxt%u(%u): Context not enabled due to init failure %d\n",
1238                           sc->sw_index, sc->hw_context, ret);
1239                goto unlock;
1240        }
1241
1242        /*
1243         * All is well. Enable the context.
1244         */
1245        sc_ctrl |= SC(CTRL_CTXT_ENABLE_SMASK);
1246        write_kctxt_csr(dd, sc->hw_context, SC(CTRL), sc_ctrl);
1247        /*
1248         * Read SendCtxtCtrl to force the write out and prevent a timing
1249         * hazard where a PIO write may reach the context before the enable.
1250         */
1251        read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
1252        sc->flags |= SCF_ENABLED;
1253
1254unlock:
1255        spin_unlock_irqrestore(&sc->alloc_lock, flags);
1256
1257        return ret;
1258}
1259
1260/* force a credit return on the context */
1261void sc_return_credits(struct send_context *sc)
1262{
1263        if (!sc)
1264                return;
1265
1266        /* a 0->1 transition schedules a credit return */
1267        write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE),
1268                SC(CREDIT_FORCE_FORCE_RETURN_SMASK));
1269        /*
1270         * Ensure that the write is flushed and the credit return is
1271         * scheduled. We care more about the 0 -> 1 transition.
1272         */
1273        read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE));
1274        /* set back to 0 for next time */
1275        write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), 0);
1276}
1277
1278/* allow all in-flight packets to drain on the context */
1279void sc_flush(struct send_context *sc)
1280{
1281        if (!sc)
1282                return;
1283
1284        sc_wait_for_packet_egress(sc, 1);
1285}
1286
1287/* drop all packets on the context, no waiting until they are sent */
1288void sc_drop(struct send_context *sc)
1289{
1290        if (!sc)
1291                return;
1292
1293        dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n",
1294                        __func__, sc->sw_index, sc->hw_context);
1295}
1296
1297/*
1298 * Start the software reaction to a context halt or SPC freeze:
1299 *      - mark the context as halted or frozen
1300 *      - stop buffer allocations
1301 *
1302 * Called from the error interrupt.  Other work is deferred until
1303 * out of the interrupt.
1304 */
1305void sc_stop(struct send_context *sc, int flag)
1306{
1307        unsigned long flags;
1308
1309        /* mark the context */
1310        sc->flags |= flag;
1311
1312        /* stop buffer allocations */
1313        spin_lock_irqsave(&sc->alloc_lock, flags);
1314        sc->flags &= ~SCF_ENABLED;
1315        spin_unlock_irqrestore(&sc->alloc_lock, flags);
1316        wake_up(&sc->halt_wait);
1317}
1318
1319#define BLOCK_DWORDS (PIO_BLOCK_SIZE/sizeof(u32))
1320#define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS)
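     /*
      * With a 64-byte PIO block, BLOCK_DWORDS is 16, so a 40-dword packet
      * (PBC included) occupies dwords_to_blocks(40) == 3 send blocks.
      */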
1321
1322/*
1323 * The send context buffer "allocator".
1324 *
1325 * @sc: the PIO send context we are allocating from
1326 * @len: length of whole packet - including PBC - in dwords
1327 * @cb: optional callback to call when the buffer is finished sending
1328 * @arg: argument for cb
1329 *
1330 * Return a pointer to a PIO buffer if successful, NULL if not enough room.
1331 */
1332struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
1333                                pio_release_cb cb, void *arg)
1334{
1335        struct pio_buf *pbuf = NULL;
1336        unsigned long flags;
1337        unsigned long avail;
1338        unsigned long blocks = dwords_to_blocks(dw_len);
1339        unsigned long start_fill;
1340        int trycount = 0;
1341        u32 head, next;
1342
1343        spin_lock_irqsave(&sc->alloc_lock, flags);
1344        if (!(sc->flags & SCF_ENABLED)) {
1345                spin_unlock_irqrestore(&sc->alloc_lock, flags);
1346                goto done;
1347        }
1348
1349retry:
1350        avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free);
1351        if (blocks > avail) {
1352                /* not enough room */
1353                if (unlikely(trycount)) { /* already tried to get more room */
1354                        spin_unlock_irqrestore(&sc->alloc_lock, flags);
1355                        goto done;
1356                }
1357                /* copy from receiver cache line and recalculate */
1358                sc->alloc_free = ACCESS_ONCE(sc->free);
1359                avail =
1360                        (unsigned long)sc->credits -
1361                        (sc->fill - sc->alloc_free);
1362                if (blocks > avail) {
1363                        /* still no room, actively update */
1364                        spin_unlock_irqrestore(&sc->alloc_lock, flags);
1365                        sc_release_update(sc);
1366                        spin_lock_irqsave(&sc->alloc_lock, flags);
1367                        sc->alloc_free = ACCESS_ONCE(sc->free);
1368                        trycount++;
1369                        goto retry;
1370                }
1371        }
1372
1373        /* there is enough room */
1374
1375        atomic_inc(&sc->buffers_allocated);
1376
1377        /* read this once */
1378        head = sc->sr_head;
1379
1380        /* "allocate" the buffer */
1381        start_fill = sc->fill;
1382        sc->fill += blocks;
1383
1384        /*
1385         * Fill the parts that the releaser looks at before moving the head.
1386         * The only necessary piece is the sent_at field.  The credits
1387         * we have just allocated cannot have been returned yet, so the
1388         * cb and arg will not be looked at for a "while".  Put them
1389         * on this side of the memory barrier anyway.
1390         */
1391        pbuf = &sc->sr[head].pbuf;
1392        pbuf->sent_at = sc->fill;
1393        pbuf->cb = cb;
1394        pbuf->arg = arg;
1395        pbuf->sc = sc;  /* could be filled in at sc->sr init time */
1396        /* make sure this is in memory before updating the head */
1397
1398        /* calculate next head index, do not store */
1399        next = head + 1;
1400        if (next >= sc->sr_size)
1401                next = 0;
1402        /* update the head - must be last! - the releaser can look at fields
1403           in pbuf once we move the head */
1404        smp_wmb();
1405        sc->sr_head = next;
1406        spin_unlock_irqrestore(&sc->alloc_lock, flags);
1407
1408        /* finish filling in the buffer outside the lock */
1409        pbuf->start = sc->base_addr + ((start_fill % sc->credits)
1410                                                        * PIO_BLOCK_SIZE);
1411        pbuf->size = sc->credits * PIO_BLOCK_SIZE;
1412        pbuf->end = sc->base_addr + pbuf->size;
1413        pbuf->block_count = blocks;
1414        pbuf->qw_written = 0;
1415        pbuf->carry_bytes = 0;
1416        pbuf->carry.val64 = 0;
1417done:
1418        return pbuf;
1419}
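     /*
      * The free-running fill/free counters make the space check above
      * cheap.  For example, a context with 64 credits that has filled
      * 1000 blocks (sc->fill) and seen 950 returned (sc->alloc_free) has
      * 64 - (1000 - 950) == 14 blocks available; a 16-block request would
      * take the slow path through sc_release_update().
      */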
1420
1421/*
1422 * There are at least two entities that can turn on credit return
1423 * interrupts and they can overlap.  Avoid problems by implementing
1424 * a count scheme that is enforced by a lock.  The lock is needed because
1425 * the count and CSR write must be paired.
1426 */
1427
1428/*
1429 * Start credit return interrupts.  This is managed by a count.  If already
1430 * on, just increment the count.
1431 */
1432void sc_add_credit_return_intr(struct send_context *sc)
1433{
1434        unsigned long flags;
1435
1436        /* lock must surround both the count change and the CSR update */
1437        spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
1438        if (sc->credit_intr_count == 0) {
1439                sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
1440                write_kctxt_csr(sc->dd, sc->hw_context,
1441                        SC(CREDIT_CTRL), sc->credit_ctrl);
1442        }
1443        sc->credit_intr_count++;
1444        spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
1445}
1446
1447/*
1448 * Stop credit return interrupts.  This is managed by a count.  Decrement
1449 * the count; if this was the last user, turn credit return interrupts off.
1450 */
1451void sc_del_credit_return_intr(struct send_context *sc)
1452{
1453        unsigned long flags;
1454
1455        WARN_ON(sc->credit_intr_count == 0);
1456
1457        /* lock must surround both the count change and the CSR update */
1458        spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
1459        sc->credit_intr_count--;
1460        if (sc->credit_intr_count == 0) {
1461                sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
1462                write_kctxt_csr(sc->dd, sc->hw_context,
1463                        SC(CREDIT_CTRL), sc->credit_ctrl);
1464        }
1465        spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
1466}
1467
1468/*
1469 * The caller must be careful when calling this.  Every call with needint
1470 * set must eventually be paired with a call with needint clear.
1471 */
1472void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint)
1473{
1474        if (needint)
1475                sc_add_credit_return_intr(sc);
1476        else
1477                sc_del_credit_return_intr(sc);
1478        trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
1479        if (needint) {
1480                mmiowb();
1481                sc_return_credits(sc);
1482        }
1483}
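
/*
 * Illustrative note (interpretation, not from the original source): when
 * needint is set, sc_return_credits() requests an immediate credit return
 * after the interrupt has been enabled, so credits that were freed before
 * the enable are reported promptly rather than missed.
 */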
1484
1485/**
1486 * sc_piobufavail - callback when a PIO buffer is available
1487 * @sc: the send context
1488 *
1489 * This is called from the interrupt handler when a PIO buffer becomes
1490 * available after hfi1_verbs_send() has returned an error indicating that
1491 * no buffers were available.  Disable the interrupt if no QPs remain waiting.
1492 */
1493static void sc_piobufavail(struct send_context *sc)
1494{
1495        struct hfi1_devdata *dd = sc->dd;
1496        struct hfi1_ibdev *dev = &dd->verbs_dev;
1497        struct list_head *list;
1498        struct hfi1_qp *qps[PIO_WAIT_BATCH_SIZE];
1499        struct hfi1_qp *qp;
1500        unsigned long flags;
1501        unsigned i, n = 0;
1502
1503        if (dd->send_contexts[sc->sw_index].type != SC_KERNEL)
1504                return;
1505        list = &sc->piowait;
1506        /*
1507         * Note: checking that the piowait list is empty and clearing
1508         * the buffer available interrupt needs to be atomic or we
1509         * could end up with QPs on the wait list with the interrupt
1510         * disabled.
1511         */
1512        write_seqlock_irqsave(&dev->iowait_lock, flags);
1513        while (!list_empty(list)) {
1514                struct iowait *wait;
1515
1516                if (n == ARRAY_SIZE(qps))
1517                        goto full;
1518                wait = list_first_entry(list, struct iowait, list);
1519                qp = container_of(wait, struct hfi1_qp, s_iowait);
1520                list_del_init(&qp->s_iowait.list);
1521                /* refcount held until actual wake up */
1522                qps[n++] = qp;
1523        }
1524        /*
1525         * Counting: only call wantpiobuf_intr() if there were waiters and they
1526         * are now all gone.
1527         */
1528        if (n)
1529                hfi1_sc_wantpiobuf_intr(sc, 0);
1530full:
1531        write_sequnlock_irqrestore(&dev->iowait_lock, flags);
1532
1533        for (i = 0; i < n; i++)
1534                hfi1_qp_wakeup(qps[i], HFI1_S_WAIT_PIO);
1535}
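
/*
 * Illustrative note: if more than PIO_WAIT_BATCH_SIZE QPs are queued, the
 * loop above exits through the "full" label without disabling the credit
 * return interrupt, so the remaining waiters are handled on a subsequent
 * interrupt.
 */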
1536
1537/* translate a send credit update to a bit code of reasons */
1538static inline int fill_code(u64 hw_free)
1539{
1540        int code = 0;
1541
1542        if (hw_free & CR_STATUS_SMASK)
1543                code |= PRC_STATUS_ERR;
1544        if (hw_free & CR_CREDIT_RETURN_DUE_TO_PBC_SMASK)
1545                code |= PRC_PBC;
1546        if (hw_free & CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK)
1547                code |= PRC_THRESHOLD;
1548        if (hw_free & CR_CREDIT_RETURN_DUE_TO_ERR_SMASK)
1549                code |= PRC_FILL_ERR;
1550        if (hw_free & CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK)
1551                code |= PRC_SC_DISABLE;
1552        return code;
1553}
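
/*
 * Illustrative note: the PRC_* bits are OR'd together, so a credit return
 * triggered by, say, both the threshold and the PBC request bits reports
 * PRC_THRESHOLD | PRC_PBC in a single callback.
 */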
1554
1555/* use the jiffies comparison macros to get the wrap right */
1556#define sent_before(a, b) time_before(a, b)     /* a < b */
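
/*
 * Illustrative note: time_before() is a signed-difference comparison, so
 * sent_before() stays correct across counter wrap as long as the two
 * values are within half the counter range of each other.
 */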
1557
1558/*
1559 * The send context buffer "releaser".
1560 */
1561void sc_release_update(struct send_context *sc)
1562{
1563        struct pio_buf *pbuf;
1564        u64 hw_free;
1565        u32 head, tail;
1566        unsigned long old_free;
1567        unsigned long extra;
1568        unsigned long flags;
1569        int code;
1570
1571        if (!sc)
1572                return;
1573
1574        spin_lock_irqsave(&sc->release_lock, flags);
1575        /* update free */
1576        hw_free = le64_to_cpu(*sc->hw_free);            /* volatile read */
1577        old_free = sc->free;
1578        extra = (((hw_free & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT)
1579                        - (old_free & CR_COUNTER_MASK))
1580                                & CR_COUNTER_MASK;
1581        sc->free = old_free + extra;
1582        trace_hfi1_piofree(sc, extra);
1583
1584        /* call sent buffer callbacks */
1585        code = -1;                              /* code not yet set */
1586        head = ACCESS_ONCE(sc->sr_head);        /* snapshot the head */
1587        tail = sc->sr_tail;
1588        while (head != tail) {
1589                pbuf = &sc->sr[tail].pbuf;
1590
1591                if (sent_before(sc->free, pbuf->sent_at)) {
1592                        /* not sent yet */
1593                        break;
1594                }
1595                if (pbuf->cb) {
1596                        if (code < 0) /* fill in code on first user */
1597                                code = fill_code(hw_free);
1598                        (*pbuf->cb)(pbuf->arg, code);
1599                }
1600
1601                tail++;
1602                if (tail >= sc->sr_size)
1603                        tail = 0;
1604        }
1605        /* update tail, in case we moved it */
1606        sc->sr_tail = tail;
1607        spin_unlock_irqrestore(&sc->release_lock, flags);
1608        sc_piobufavail(sc);
1609}
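
/*
 * Worked example (illustration only, counter width assumed 16 bits for
 * concreteness): if the hardware CR_COUNTER field reads 0x0005 and the
 * low bits of sc->free are 0xfffe, then extra = (0x0005 - 0xfffe) &
 * 0xffff = 7, so sc->free advances by 7 even though the hardware field
 * wrapped.
 */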
1610
1611/*
1612 * Send context group releaser.  Argument is the send context that caused
1613 * the interrupt.  Called from the send context interrupt handler.
1614 *
1615 * Call release on all contexts in the group.
1616 *
1617 * This routine takes the sc_lock without an irqsave because it is only
1618 * called from an interrupt handler.  Adjust if that changes.
1619 */
1620void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context)
1621{
1622        struct send_context *sc;
1623        u32 sw_index;
1624        u32 gc, gc_end;
1625
1626        spin_lock(&dd->sc_lock);
1627        sw_index = dd->hw_to_sw[hw_context];
1628        if (unlikely(sw_index >= dd->num_send_contexts)) {
1629                dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n",
1630                        __func__, hw_context, sw_index);
1631                goto done;
1632        }
1633        sc = dd->send_contexts[sw_index].sc;
1634        if (unlikely(!sc))
1635                goto done;
1636
1637        gc = group_context(hw_context, sc->group);
1638        gc_end = gc + group_size(sc->group);
1639        for (; gc < gc_end; gc++) {
1640                sw_index = dd->hw_to_sw[gc];
1641                if (unlikely(sw_index >= dd->num_send_contexts)) {
1642                        dd_dev_err(dd,
1643                                "%s: invalid hw (%u) to sw (%u) mapping\n",
1644                                __func__, hw_context, sw_index);
1645                        continue;
1646                }
1647                sc_release_update(dd->send_contexts[sw_index].sc);
1648        }
1649done:
1650        spin_unlock(&dd->sc_lock);
1651}
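
/*
 * Illustrative note (assuming group_context() returns the first hw context
 * in the group and group_size() the number of contexts in it): for
 * hw_context 5 in a group of four contexts, the loop above walks hw
 * contexts 4 through 7 and runs the credit releaser on each.
 */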
1652
1653int init_pervl_scs(struct hfi1_devdata *dd)
1654{
1655        int i;
1656        u64 mask, all_vl_mask = (u64) 0x80ff; /* VLs 0-7, 15 */
1657        u32 ctxt;
1658
1659        dd->vld[15].sc = sc_alloc(dd, SC_KERNEL,
1660                                  dd->rcd[0]->rcvhdrqentsize, dd->node);
1661        if (!dd->vld[15].sc)
1662                goto nomem;
1663        hfi1_init_ctxt(dd->vld[15].sc);
1664        dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048);
1665        for (i = 0; i < num_vls; i++) {
1666                /*
1667                 * Since this function does not deal with a specific
1668                 * receive context but we need the RcvHdrQ entry size,
1669                 * use the size from rcd[0]. It is guaranteed to be
1670                 * valid at this point and will remain the same for all
1671                 * receive contexts.
1672                 */
1673                dd->vld[i].sc = sc_alloc(dd, SC_KERNEL,
1674                                         dd->rcd[0]->rcvhdrqentsize, dd->node);
1675                if (!dd->vld[i].sc)
1676                        goto nomem;
1677
1678                hfi1_init_ctxt(dd->vld[i].sc);
1679
1680                /* non-VL15 contexts start with the max MTU */
1681                dd->vld[i].mtu = hfi1_max_mtu;
1682        }
1683        sc_enable(dd->vld[15].sc);
1684        ctxt = dd->vld[15].sc->hw_context;
1685        mask = all_vl_mask & ~(1LL << 15);
1686        write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
1687        dd_dev_info(dd,
1688                    "Using send context %u(%u) for VL15\n",
1689                    dd->vld[15].sc->sw_index, ctxt);
1690        for (i = 0; i < num_vls; i++) {
1691                sc_enable(dd->vld[i].sc);
1692                ctxt = dd->vld[i].sc->hw_context;
1693                mask = all_vl_mask & ~(1LL << i);
1694                write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
1695        }
1696        return 0;
1697nomem:
1698        sc_free(dd->vld[15].sc);
1699        for (i = 0; i < num_vls; i++)
1700                sc_free(dd->vld[i].sc);
1701        return -ENOMEM;
1702}
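
/*
 * Illustrative note: all_vl_mask is 0x80ff (VLs 0-7 plus VL15), so the
 * VL15 context is programmed with SC(CHECK_VL) = 0x00ff while the kernel
 * context for VL i gets 0x80ff & ~(1 << i), e.g. 0x80fe for VL0.
 */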
1703
1704int init_credit_return(struct hfi1_devdata *dd)
1705{
1706        int ret;
1707        int num_numa;
1708        int i;
1709
1710        num_numa = num_online_nodes();
1711        /* enforce the expectation that the NUMA node IDs are compact */
1712        for (i = 0; i < num_numa; i++) {
1713                if (!node_online(i)) {
1714                        dd_dev_err(dd, "NUMA nodes are not compact\n");
1715                        ret = -EINVAL;
1716                        goto done;
1717                }
1718        }
1719
1720        dd->cr_base = kcalloc(
1721                num_numa,
1722                sizeof(struct credit_return_base),
1723                GFP_KERNEL);
1724        if (!dd->cr_base) {
1725                dd_dev_err(dd, "Unable to allocate credit return base\n");
1726                ret = -ENOMEM;
1727                goto done;
1728        }
1729        for (i = 0; i < num_numa; i++) {
1730                int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return);
1731
1732                set_dev_node(&dd->pcidev->dev, i);
1733                dd->cr_base[i].va = dma_zalloc_coherent(
1734                                        &dd->pcidev->dev,
1735                                        bytes,
1736                                        &dd->cr_base[i].pa,
1737                                        GFP_KERNEL);
1738                if (!dd->cr_base[i].va) {
1739                        set_dev_node(&dd->pcidev->dev, dd->node);
1740                        dd_dev_err(dd,
1741                                "Unable to allocate credit return DMA range for NUMA %d\n",
1742                                i);
1743                        ret = -ENOMEM;
1744                        goto done;
1745                }
1746        }
1747        set_dev_node(&dd->pcidev->dev, dd->node);
1748
1749        ret = 0;
1750done:
1751        return ret;
1752}
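
/*
 * Illustrative note: set_dev_node() retargets the PCI device's NUMA node
 * around each dma_zalloc_coherent() call above so that every node's
 * credit return block can be allocated from memory local to that node;
 * the device node is restored to dd->node on both the success and error
 * paths.
 */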
1753
1754void free_credit_return(struct hfi1_devdata *dd)
1755{
1756        int num_numa;
1757        int i;
1758
1759        if (!dd->cr_base)
1760                return;
1761
1762        num_numa = num_online_nodes();
1763        for (i = 0; i < num_numa; i++) {
1764                if (dd->cr_base[i].va) {
1765                        dma_free_coherent(&dd->pcidev->dev,
1766                                TXE_NUM_CONTEXTS
1767                                        * sizeof(struct credit_return),
1768                                dd->cr_base[i].va,
1769                                dd->cr_base[i].pa);
1770                }
1771        }
1772        kfree(dd->cr_base);
1773        dd->cr_base = NULL;
1774}
1775