linux/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
<<
>>
Prefs
   1/*
   2 * Copyright 2019 Advanced Micro Devices, Inc.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice shall be included in
  12 * all copies or substantial portions of the Software.
  13 *
  14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20 * OTHER DEALINGS IN THE SOFTWARE.
  21 *
  22 */
  23
  24#include "amdgpu_ras_eeprom.h"
  25#include "amdgpu.h"
  26#include "amdgpu_ras.h"
  27#include <linux/bits.h>
  28#include "atom.h"
  29
  30#define EEPROM_I2C_TARGET_ADDR_VEGA20           0xA0
  31#define EEPROM_I2C_TARGET_ADDR_ARCTURUS         0xA8
  32#define EEPROM_I2C_TARGET_ADDR_ARCTURUS_D342    0xA0
  33
  34/*
  35 * The 2 macros bellow represent the actual size in bytes that
  36 * those entities occupy in the EEPROM memory.
  37 * EEPROM_TABLE_RECORD_SIZE is different than sizeof(eeprom_table_record) which
  38 * uses uint64 to store 6b fields such as retired_page.
  39 */
  40#define EEPROM_TABLE_HEADER_SIZE 20
  41#define EEPROM_TABLE_RECORD_SIZE 24
  42
  43#define EEPROM_ADDRESS_SIZE 0x2
  44
  45/* Table hdr is 'AMDR' */
  46#define EEPROM_TABLE_HDR_VAL 0x414d4452
  47#define EEPROM_TABLE_VER 0x00010000
  48
  49/* Assume 2 Mbit size */
  50#define EEPROM_SIZE_BYTES 256000
  51#define EEPROM_PAGE__SIZE_BYTES 256
  52#define EEPROM_HDR_START 0
  53#define EEPROM_RECORD_START (EEPROM_HDR_START + EEPROM_TABLE_HEADER_SIZE)
  54#define EEPROM_MAX_RECORD_NUM ((EEPROM_SIZE_BYTES - EEPROM_TABLE_HEADER_SIZE) / EEPROM_TABLE_RECORD_SIZE)
  55#define EEPROM_ADDR_MSB_MASK GENMASK(17, 8)
  56
  57#define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, eeprom_control))->adev
  58
  59static bool __get_eeprom_i2c_addr_arct(struct amdgpu_device *adev,
  60                                       uint16_t *i2c_addr)
  61{
  62        struct atom_context *atom_ctx = adev->mode_info.atom_context;
  63
  64        if (!i2c_addr || !atom_ctx)
  65                return false;
  66
  67        if (strnstr(atom_ctx->vbios_version,
  68                    "D342",
  69                    sizeof(atom_ctx->vbios_version)))
  70                *i2c_addr = EEPROM_I2C_TARGET_ADDR_ARCTURUS_D342;
  71        else
  72                *i2c_addr = EEPROM_I2C_TARGET_ADDR_ARCTURUS;
  73
  74        return true;
  75}
  76
  77static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev,
  78                                  uint16_t *i2c_addr)
  79{
  80        if (!i2c_addr)
  81                return false;
  82
  83        switch (adev->asic_type) {
  84        case CHIP_VEGA20:
  85                *i2c_addr = EEPROM_I2C_TARGET_ADDR_VEGA20;
  86                break;
  87
  88        case CHIP_ARCTURUS:
  89                return __get_eeprom_i2c_addr_arct(adev, i2c_addr);
  90
  91        default:
  92                return false;
  93        }
  94
  95        return true;
  96}
  97
  98static void __encode_table_header_to_buff(struct amdgpu_ras_eeprom_table_header *hdr,
  99                                          unsigned char *buff)
 100{
 101        uint32_t *pp = (uint32_t *) buff;
 102
 103        pp[0] = cpu_to_le32(hdr->header);
 104        pp[1] = cpu_to_le32(hdr->version);
 105        pp[2] = cpu_to_le32(hdr->first_rec_offset);
 106        pp[3] = cpu_to_le32(hdr->tbl_size);
 107        pp[4] = cpu_to_le32(hdr->checksum);
 108}
 109
 110static void __decode_table_header_from_buff(struct amdgpu_ras_eeprom_table_header *hdr,
 111                                          unsigned char *buff)
 112{
 113        uint32_t *pp = (uint32_t *)buff;
 114
 115        hdr->header           = le32_to_cpu(pp[0]);
 116        hdr->version          = le32_to_cpu(pp[1]);
 117        hdr->first_rec_offset = le32_to_cpu(pp[2]);
 118        hdr->tbl_size         = le32_to_cpu(pp[3]);
 119        hdr->checksum         = le32_to_cpu(pp[4]);
 120}
 121
 122static int __update_table_header(struct amdgpu_ras_eeprom_control *control,
 123                                 unsigned char *buff)
 124{
 125        int ret = 0;
 126        struct amdgpu_device *adev = to_amdgpu_device(control);
 127        struct i2c_msg msg = {
 128                        .addr   = 0,
 129                        .flags  = 0,
 130                        .len    = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE,
 131                        .buf    = buff,
 132        };
 133
 134
 135        *(uint16_t *)buff = EEPROM_HDR_START;
 136        __encode_table_header_to_buff(&control->tbl_hdr, buff + EEPROM_ADDRESS_SIZE);
 137
 138        msg.addr = control->i2c_address;
 139
 140        ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1);
 141        if (ret < 1)
 142                DRM_ERROR("Failed to write EEPROM table header, ret:%d", ret);
 143
 144        return ret;
 145}
 146
 147static uint32_t  __calc_hdr_byte_sum(struct amdgpu_ras_eeprom_control *control)
 148{
 149        int i;
 150        uint32_t tbl_sum = 0;
 151
 152        /* Header checksum, skip checksum field in the calculation */
 153        for (i = 0; i < sizeof(control->tbl_hdr) - sizeof(control->tbl_hdr.checksum); i++)
 154                tbl_sum += *(((unsigned char *)&control->tbl_hdr) + i);
 155
 156        return tbl_sum;
 157}
 158
 159static uint32_t  __calc_recs_byte_sum(struct eeprom_table_record *records,
 160                                      int num)
 161{
 162        int i, j;
 163        uint32_t tbl_sum = 0;
 164
 165        /* Records checksum */
 166        for (i = 0; i < num; i++) {
 167                struct eeprom_table_record *record = &records[i];
 168
 169                for (j = 0; j < sizeof(*record); j++) {
 170                        tbl_sum += *(((unsigned char *)record) + j);
 171                }
 172        }
 173
 174        return tbl_sum;
 175}
 176
 177static inline uint32_t  __calc_tbl_byte_sum(struct amdgpu_ras_eeprom_control *control,
 178                                  struct eeprom_table_record *records, int num)
 179{
 180        return __calc_hdr_byte_sum(control) + __calc_recs_byte_sum(records, num);
 181}
 182
 183/* Checksum = 256 -((sum of all table entries) mod 256) */
 184static void __update_tbl_checksum(struct amdgpu_ras_eeprom_control *control,
 185                                  struct eeprom_table_record *records, int num,
 186                                  uint32_t old_hdr_byte_sum)
 187{
 188        /*
 189         * This will update the table sum with new records.
 190         *
 191         * TODO: What happens when the EEPROM table is to be wrapped around
 192         * and old records from start will get overridden.
 193         */
 194
 195        /* need to recalculate updated header byte sum */
 196        control->tbl_byte_sum -= old_hdr_byte_sum;
 197        control->tbl_byte_sum += __calc_tbl_byte_sum(control, records, num);
 198
 199        control->tbl_hdr.checksum = 256 - (control->tbl_byte_sum % 256);
 200}
 201
 202/* table sum mod 256 + checksum must equals 256 */
 203static bool __validate_tbl_checksum(struct amdgpu_ras_eeprom_control *control,
 204                            struct eeprom_table_record *records, int num)
 205{
 206        control->tbl_byte_sum = __calc_tbl_byte_sum(control, records, num);
 207
 208        if (control->tbl_hdr.checksum + (control->tbl_byte_sum % 256) != 256) {
 209                DRM_WARN("Checksum mismatch, checksum: %u ", control->tbl_hdr.checksum);
 210                return false;
 211        }
 212
 213        return true;
 214}
 215
 216int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
 217{
 218        unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE] = { 0 };
 219        struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
 220        int ret = 0;
 221
 222        mutex_lock(&control->tbl_mutex);
 223
 224        hdr->header = EEPROM_TABLE_HDR_VAL;
 225        hdr->version = EEPROM_TABLE_VER;
 226        hdr->first_rec_offset = EEPROM_RECORD_START;
 227        hdr->tbl_size = EEPROM_TABLE_HEADER_SIZE;
 228
 229        control->tbl_byte_sum = 0;
 230        __update_tbl_checksum(control, NULL, 0, 0);
 231        control->next_addr = EEPROM_RECORD_START;
 232
 233        ret = __update_table_header(control, buff);
 234
 235        mutex_unlock(&control->tbl_mutex);
 236
 237        return ret;
 238
 239}
 240
 241int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
 242{
 243        int ret = 0;
 244        struct amdgpu_device *adev = to_amdgpu_device(control);
 245        unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE] = { 0 };
 246        struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
 247        struct i2c_msg msg = {
 248                        .addr   = 0,
 249                        .flags  = I2C_M_RD,
 250                        .len    = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE,
 251                        .buf    = buff,
 252        };
 253
 254        /* Verify i2c adapter is initialized */
 255        if (!adev->pm.smu_i2c.algo)
 256                return -ENOENT;
 257
 258        if (!__get_eeprom_i2c_addr(adev, &control->i2c_address))
 259                return -EINVAL;
 260
 261        mutex_init(&control->tbl_mutex);
 262
 263        msg.addr = control->i2c_address;
 264        /* Read/Create table header from EEPROM address 0 */
 265        ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1);
 266        if (ret < 1) {
 267                DRM_ERROR("Failed to read EEPROM table header, ret:%d", ret);
 268                return ret;
 269        }
 270
 271        __decode_table_header_from_buff(hdr, &buff[2]);
 272
 273        if (hdr->header == EEPROM_TABLE_HDR_VAL) {
 274                control->num_recs = (hdr->tbl_size - EEPROM_TABLE_HEADER_SIZE) /
 275                                    EEPROM_TABLE_RECORD_SIZE;
 276                control->tbl_byte_sum = __calc_hdr_byte_sum(control);
 277                control->next_addr = EEPROM_RECORD_START;
 278
 279                DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
 280                                 control->num_recs);
 281
 282        } else {
 283                DRM_INFO("Creating new EEPROM table");
 284
 285                ret = amdgpu_ras_eeprom_reset_table(control);
 286        }
 287
 288        return ret == 1 ? 0 : -EIO;
 289}
 290
 291static void __encode_table_record_to_buff(struct amdgpu_ras_eeprom_control *control,
 292                                          struct eeprom_table_record *record,
 293                                          unsigned char *buff)
 294{
 295        __le64 tmp = 0;
 296        int i = 0;
 297
 298        /* Next are all record fields according to EEPROM page spec in LE foramt */
 299        buff[i++] = record->err_type;
 300
 301        buff[i++] = record->bank;
 302
 303        tmp = cpu_to_le64(record->ts);
 304        memcpy(buff + i, &tmp, 8);
 305        i += 8;
 306
 307        tmp = cpu_to_le64((record->offset & 0xffffffffffff));
 308        memcpy(buff + i, &tmp, 6);
 309        i += 6;
 310
 311        buff[i++] = record->mem_channel;
 312        buff[i++] = record->mcumc_id;
 313
 314        tmp = cpu_to_le64((record->retired_page & 0xffffffffffff));
 315        memcpy(buff + i, &tmp, 6);
 316}
 317
 318static void __decode_table_record_from_buff(struct amdgpu_ras_eeprom_control *control,
 319                                            struct eeprom_table_record *record,
 320                                            unsigned char *buff)
 321{
 322        __le64 tmp = 0;
 323        int i =  0;
 324
 325        /* Next are all record fields according to EEPROM page spec in LE foramt */
 326        record->err_type = buff[i++];
 327
 328        record->bank = buff[i++];
 329
 330        memcpy(&tmp, buff + i, 8);
 331        record->ts = le64_to_cpu(tmp);
 332        i += 8;
 333
 334        memcpy(&tmp, buff + i, 6);
 335        record->offset = (le64_to_cpu(tmp) & 0xffffffffffff);
 336        i += 6;
 337
 338        record->mem_channel = buff[i++];
 339        record->mcumc_id = buff[i++];
 340
 341        memcpy(&tmp, buff + i,  6);
 342        record->retired_page = (le64_to_cpu(tmp) & 0xffffffffffff);
 343}
 344
 345/*
 346 * When reaching end of EEPROM memory jump back to 0 record address
 347 * When next record access will go beyond EEPROM page boundary modify bits A17/A8
 348 * in I2C selector to go to next page
 349 */
 350static uint32_t __correct_eeprom_dest_address(uint32_t curr_address)
 351{
 352        uint32_t next_address = curr_address + EEPROM_TABLE_RECORD_SIZE;
 353
 354        /* When all EEPROM memory used jump back to 0 address */
 355        if (next_address > EEPROM_SIZE_BYTES) {
 356                DRM_INFO("Reached end of EEPROM memory, jumping to 0 "
 357                         "and overriding old record");
 358                return EEPROM_RECORD_START;
 359        }
 360
 361        /*
 362         * To check if we overflow page boundary  compare next address with
 363         * current and see if bits 17/8 of the EEPROM address will change
 364         * If they do start from the next 256b page
 365         *
 366         * https://www.st.com/resource/en/datasheet/m24m02-dr.pdf sec. 5.1.2
 367         */
 368        if ((curr_address & EEPROM_ADDR_MSB_MASK) != (next_address & EEPROM_ADDR_MSB_MASK)) {
 369                DRM_DEBUG_DRIVER("Reached end of EEPROM memory page, jumping to next: %lx",
 370                                (next_address & EEPROM_ADDR_MSB_MASK));
 371
 372                return  (next_address & EEPROM_ADDR_MSB_MASK);
 373        }
 374
 375        return curr_address;
 376}
 377
 378int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
 379                                            struct eeprom_table_record *records,
 380                                            bool write,
 381                                            int num)
 382{
 383        int i, ret = 0;
 384        struct i2c_msg *msgs, *msg;
 385        unsigned char *buffs, *buff;
 386        struct eeprom_table_record *record;
 387        struct amdgpu_device *adev = to_amdgpu_device(control);
 388
 389        if (adev->asic_type != CHIP_VEGA20 && adev->asic_type != CHIP_ARCTURUS)
 390                return 0;
 391
 392        buffs = kcalloc(num, EEPROM_ADDRESS_SIZE + EEPROM_TABLE_RECORD_SIZE,
 393                         GFP_KERNEL);
 394        if (!buffs)
 395                return -ENOMEM;
 396
 397        mutex_lock(&control->tbl_mutex);
 398
 399        msgs = kcalloc(num, sizeof(*msgs), GFP_KERNEL);
 400        if (!msgs) {
 401                ret = -ENOMEM;
 402                goto free_buff;
 403        }
 404
 405        /* In case of overflow just start from beginning to not lose newest records */
 406        if (write && (control->next_addr + EEPROM_TABLE_RECORD_SIZE * num > EEPROM_SIZE_BYTES))
 407                control->next_addr = EEPROM_RECORD_START;
 408
 409
 410        /*
 411         * TODO Currently makes EEPROM writes for each record, this creates
 412         * internal fragmentation. Optimized the code to do full page write of
 413         * 256b
 414         */
 415        for (i = 0; i < num; i++) {
 416                buff = &buffs[i * (EEPROM_ADDRESS_SIZE + EEPROM_TABLE_RECORD_SIZE)];
 417                record = &records[i];
 418                msg = &msgs[i];
 419
 420                control->next_addr = __correct_eeprom_dest_address(control->next_addr);
 421
 422                /*
 423                 * Update bits 16,17 of EEPROM address in I2C address by setting them
 424                 * to bits 1,2 of Device address byte
 425                 */
 426                msg->addr = control->i2c_address |
 427                                ((control->next_addr & EEPROM_ADDR_MSB_MASK) >> 15);
 428                msg->flags      = write ? 0 : I2C_M_RD;
 429                msg->len        = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_RECORD_SIZE;
 430                msg->buf        = buff;
 431
 432                /* Insert the EEPROM dest addess, bits 0-15 */
 433                buff[0] = ((control->next_addr >> 8) & 0xff);
 434                buff[1] = (control->next_addr & 0xff);
 435
 436                /* EEPROM table content is stored in LE format */
 437                if (write)
 438                        __encode_table_record_to_buff(control, record, buff + EEPROM_ADDRESS_SIZE);
 439
 440                /*
 441                 * The destination EEPROM address might need to be corrected to account
 442                 * for page or entire memory wrapping
 443                 */
 444                control->next_addr += EEPROM_TABLE_RECORD_SIZE;
 445        }
 446
 447        ret = i2c_transfer(&adev->pm.smu_i2c, msgs, num);
 448        if (ret < 1) {
 449                DRM_ERROR("Failed to process EEPROM table records, ret:%d", ret);
 450
 451                /* TODO Restore prev next EEPROM address ? */
 452                goto free_msgs;
 453        }
 454
 455
 456        if (!write) {
 457                for (i = 0; i < num; i++) {
 458                        buff = &buffs[i*(EEPROM_ADDRESS_SIZE + EEPROM_TABLE_RECORD_SIZE)];
 459                        record = &records[i];
 460
 461                        __decode_table_record_from_buff(control, record, buff + EEPROM_ADDRESS_SIZE);
 462                }
 463        }
 464
 465        if (write) {
 466                uint32_t old_hdr_byte_sum = __calc_hdr_byte_sum(control);
 467
 468                /*
 469                 * Update table header with size and CRC and account for table
 470                 * wrap around where the assumption is that we treat it as empty
 471                 * table
 472                 *
 473                 * TODO - Check the assumption is correct
 474                 */
 475                control->num_recs += num;
 476                control->num_recs %= EEPROM_MAX_RECORD_NUM;
 477                control->tbl_hdr.tbl_size += EEPROM_TABLE_RECORD_SIZE * num;
 478                if (control->tbl_hdr.tbl_size > EEPROM_SIZE_BYTES)
 479                        control->tbl_hdr.tbl_size = EEPROM_TABLE_HEADER_SIZE +
 480                        control->num_recs * EEPROM_TABLE_RECORD_SIZE;
 481
 482                __update_tbl_checksum(control, records, num, old_hdr_byte_sum);
 483
 484                __update_table_header(control, buffs);
 485        } else if (!__validate_tbl_checksum(control, records, num)) {
 486                DRM_WARN("EEPROM Table checksum mismatch!");
 487                /* TODO Uncomment when EEPROM read/write is relliable */
 488                /* ret = -EIO; */
 489        }
 490
 491free_msgs:
 492        kfree(msgs);
 493
 494free_buff:
 495        kfree(buffs);
 496
 497        mutex_unlock(&control->tbl_mutex);
 498
 499        return ret == num ? 0 : -EIO;
 500}
 501
 502/* Used for testing if bugs encountered */
 503#if 0
 504void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control)
 505{
 506        int i;
 507        struct eeprom_table_record *recs = kcalloc(1, sizeof(*recs), GFP_KERNEL);
 508
 509        if (!recs)
 510                return;
 511
 512        for (i = 0; i < 1 ; i++) {
 513                recs[i].address = 0xdeadbeef;
 514                recs[i].retired_page = i;
 515        }
 516
 517        if (!amdgpu_ras_eeprom_process_recods(control, recs, true, 1)) {
 518
 519                memset(recs, 0, sizeof(*recs) * 1);
 520
 521                control->next_addr = EEPROM_RECORD_START;
 522
 523                if (!amdgpu_ras_eeprom_process_recods(control, recs, false, 1)) {
 524                        for (i = 0; i < 1; i++)
 525                                DRM_INFO("rec.address :0x%llx, rec.retired_page :%llu",
 526                                         recs[i].address, recs[i].retired_page);
 527                } else
 528                        DRM_ERROR("Failed in reading from table");
 529
 530        } else
 531                DRM_ERROR("Failed in writing to table");
 532}
 533#endif
 534