mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-12-20 09:34:44 +08:00
drm/amdgpu: RAS EEPROM table is now in debugfs
Add "ras_eeprom_size" file in debugfs, which reports the maximum size allocated to the RAS table in EEPROM, as the number of bytes and the number of records it could store. For instance, $cat /sys/kernel/debug/dri/0/ras/ras_eeprom_size 262144 bytes or 10921 records $_ Add "ras_eeprom_table" file in debugfs, which dumps the RAS table stored in EEPROM, in a formatted way. For instance, $cat ras_eeprom_table Signature Version FirstOffs Size Checksum 0x414D4452 0x00010000 0x00000014 0x000000EC 0x000000DA Index Offset ErrType Bank/CU TimeStamp Offs/Addr MemChl MCUMCID RetiredPage 0 0x00014 ue 0x00 0x00000000607608DC 0x000000000000 0x00 0x00 0x000000000000 1 0x0002C ue 0x00 0x00000000607608DC 0x000000001000 0x00 0x00 0x000000000001 2 0x00044 ue 0x00 0x00000000607608DC 0x000000002000 0x00 0x00 0x000000000002 3 0x0005C ue 0x00 0x00000000607608DC 0x000000003000 0x00 0x00 0x000000000003 4 0x00074 ue 0x00 0x00000000607608DC 0x000000004000 0x00 0x00 0x000000000004 5 0x0008C ue 0x00 0x00000000607608DC 0x000000005000 0x00 0x00 0x000000000005 6 0x000A4 ue 0x00 0x00000000607608DC 0x000000006000 0x00 0x00 0x000000000006 7 0x000BC ue 0x00 0x00000000607608DC 0x000000007000 0x00 0x00 0x000000000007 8 0x000D4 ue 0x00 0x00000000607608DD 0x000000008000 0x00 0x00 0x000000000008 $_ Cc: Alexander Deucher <Alexander.Deucher@amd.com> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com> Cc: John Clements <john.clements@amd.com> Cc: Hawking Zhang <Hawking.Zhang@amd.com> Cc: Xinhui Pan <xinhui.pan@amd.com> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com> Acked-by: Alexander Deucher <Alexander.Deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
63d4c081a5
commit
c65b0805e7
@ -404,9 +404,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
|
||||
/* umc ce/ue error injection for a bad page is not allowed */
|
||||
if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
|
||||
amdgpu_ras_check_bad_page(adev, data.inject.address)) {
|
||||
dev_warn(adev->dev, "RAS WARN: 0x%llx has been marked "
|
||||
"as bad before error injection!\n",
|
||||
data.inject.address);
|
||||
dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
|
||||
"already been marked as bad!\n",
|
||||
data.inject.address);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -1301,6 +1301,12 @@ static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *
|
||||
&con->bad_page_cnt_threshold);
|
||||
debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
|
||||
debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
|
||||
debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
|
||||
&amdgpu_ras_debugfs_eeprom_size_ops);
|
||||
con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
|
||||
S_IRUGO, dir, adev,
|
||||
&amdgpu_ras_debugfs_eeprom_table_ops);
|
||||
amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);
|
||||
|
||||
/*
|
||||
* After one uncorrectable error happens, usually GPU recovery will
|
||||
|
@ -318,6 +318,7 @@ struct amdgpu_ras {
|
||||
/* sysfs */
|
||||
struct device_attribute features_attr;
|
||||
struct bin_attribute badpages_attr;
|
||||
struct dentry *de_ras_eeprom_table;
|
||||
/* block array */
|
||||
struct ras_manager *objs;
|
||||
|
||||
|
@ -27,6 +27,8 @@
|
||||
#include <linux/bits.h>
|
||||
#include "atom.h"
|
||||
#include "amdgpu_eeprom.h"
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/uaccess.h>
|
||||
|
||||
#define EEPROM_I2C_MADDR_VEGA20 0x0
|
||||
#define EEPROM_I2C_MADDR_ARCTURUS 0x40000
|
||||
@ -70,6 +72,13 @@
|
||||
/* Convert a byte offset _O into a record index by subtracting the
 * control's ras_record_offset and dividing by the record size.
 */
#define RAS_OFFSET_TO_INDEX(_C, _O) \
	(((_O) - (_C)->ras_record_offset) / RAS_TABLE_RECORD_SIZE)

/* Given a 0-based relative record index, 0, 1, 2, ..., etc., off
 * of "fri", return the absolute record index off of the end of
 * the table header. The result wraps modulo ras_max_record_count.
 */
#define RAS_RI_TO_AI(_C, _I) \
	(((_I) + (_C)->ras_fri) % (_C)->ras_max_record_count)

/* Number of whole records that fit in tbl_size once the table header
 * has been subtracted.
 */
#define RAS_NUM_RECS(_tbl_hdr) \
	(((_tbl_hdr)->tbl_size - RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE)
|
||||
|
||||
@ -77,13 +86,10 @@
|
||||
|
||||
static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
|
||||
{
|
||||
if ((adev->asic_type == CHIP_VEGA20) ||
|
||||
(adev->asic_type == CHIP_ARCTURUS) ||
|
||||
(adev->asic_type == CHIP_SIENNA_CICHLID) ||
|
||||
(adev->asic_type == CHIP_ALDEBARAN))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
return adev->asic_type == CHIP_VEGA20 ||
|
||||
adev->asic_type == CHIP_ARCTURUS ||
|
||||
adev->asic_type == CHIP_SIENNA_CICHLID ||
|
||||
adev->asic_type == CHIP_ALDEBARAN;
|
||||
}
|
||||
|
||||
static bool __get_eeprom_i2c_addr_arct(struct amdgpu_device *adev,
|
||||
@ -258,6 +264,8 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
|
||||
control->ras_num_recs = 0;
|
||||
control->ras_fri = 0;
|
||||
|
||||
amdgpu_ras_debugfs_set_ret_size(control);
|
||||
|
||||
mutex_unlock(&control->ras_tbl_mutex);
|
||||
|
||||
return res;
|
||||
@ -591,6 +599,8 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
|
||||
res = amdgpu_ras_eeprom_append_table(control, record, num);
|
||||
if (!res)
|
||||
res = amdgpu_ras_eeprom_update_header(control);
|
||||
if (!res)
|
||||
amdgpu_ras_debugfs_set_ret_size(control);
|
||||
|
||||
mutex_unlock(&control->ras_tbl_mutex);
|
||||
return res;
|
||||
@ -734,6 +744,223 @@ inline uint32_t amdgpu_ras_eeprom_max_record_count(void)
|
||||
return RAS_MAX_RECORD_COUNT;
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
amdgpu_ras_debugfs_eeprom_size_read(struct file *f, char __user *buf,
|
||||
size_t size, loff_t *pos)
|
||||
{
|
||||
struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
|
||||
u8 data[50];
|
||||
int res;
|
||||
|
||||
if (!size)
|
||||
return size;
|
||||
|
||||
if (!ras || !control) {
|
||||
res = snprintf(data, sizeof(data), "Not supported\n");
|
||||
} else {
|
||||
res = snprintf(data, sizeof(data), "%d bytes or %d records\n",
|
||||
RAS_TBL_SIZE_BYTES, control->ras_max_record_count);
|
||||
}
|
||||
|
||||
if (*pos >= res)
|
||||
return 0;
|
||||
|
||||
res -= *pos;
|
||||
res = min_t(size_t, res, size);
|
||||
|
||||
if (copy_to_user(buf, &data[*pos], res))
|
||||
return -EINVAL;
|
||||
|
||||
*pos += res;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops = {
|
||||
.owner = THIS_MODULE,
|
||||
.read = amdgpu_ras_debugfs_eeprom_size_read,
|
||||
.write = NULL,
|
||||
.llseek = default_llseek,
|
||||
};
|
||||
|
||||
/* Text layout of the "ras_eeprom_table" debugfs file.
 *
 * The two *_str strings are column captions; their length is taken with
 * strlen() by the reader. The two *_fmt strings are printf formats whose
 * output length MUST equal the matching *_fmt_size macro, because the
 * reader copies exactly that many bytes per formatted line.
 */
static const char *tbl_hdr_str = " Signature Version FirstOffs Size Checksum\n";
/* Five "0x" + 8 hex digit fields, four separating spaces, one newline. */
static const char *tbl_hdr_fmt = "0x%08X 0x%08X 0x%08X 0x%08X 0x%08X\n";
#define tbl_hdr_fmt_size (5 * (2+8) + 4 + 1)
static const char *rec_hdr_str = "Index Offset ErrType Bank/CU TimeStamp Offs/Addr MemChl MCUMCID RetiredPage\n";
/* Column widths, in order: 5, 7, 7, 7, 18, 14, 6, 7, 14, each followed by
 * one separator (a space, or the newline after the last column). The
 * "0x%02X" conversions print only 4 characters, so the Bank/CU, MemChl
 * and MCUMCID columns are left-padded with literal spaces (3, 2 and 3)
 * to reach the widths that rec_hdr_fmt_size accounts for.
 */
static const char *rec_hdr_fmt = "%5d 0x%05X %7s    0x%02X 0x%016llX 0x%012llX   0x%02X    0x%02X 0x%012llX\n";
#define rec_hdr_fmt_size (5 + 1 + 7 + 1 + 7 + 1 + 7 + 1 + 18 + 1 + 14 + 1 + 6 + 1 + 7 + 1 + 14 + 1)
|
||||
|
||||
static const char *record_err_type_str[AMDGPU_RAS_EEPROM_ERR_COUNT] = {
|
||||
"ignore",
|
||||
"re",
|
||||
"ue",
|
||||
};
|
||||
|
||||
static loff_t amdgpu_ras_debugfs_table_size(struct amdgpu_ras_eeprom_control *control)
|
||||
{
|
||||
return strlen(tbl_hdr_str) + tbl_hdr_fmt_size +
|
||||
strlen(rec_hdr_str) + rec_hdr_fmt_size * control->ras_num_recs;
|
||||
}
|
||||
|
||||
void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control)
|
||||
{
|
||||
struct amdgpu_ras *ras = container_of(control, struct amdgpu_ras,
|
||||
eeprom_control);
|
||||
struct dentry *de = ras->de_ras_eeprom_table;
|
||||
|
||||
if (de)
|
||||
d_inode(de)->i_size = amdgpu_ras_debugfs_table_size(control);
|
||||
}
|
||||
|
||||
/**
 * amdgpu_ras_debugfs_table_read -- emit the formatted RAS table text
 * @f: the open debugfs file; its inode's i_private holds the amdgpu device
 * @buf: user-space destination buffer
 * @size: number of bytes requested
 * @pos: file position, advanced by the number of bytes copied
 *
 * The text consists of four consecutive sections: the table-header
 * caption, the formatted table header, the record caption, and one
 * fixed-size line per record. Starting from *@pos, each section that
 * still has bytes at or after *@pos is copied out until @size is used up.
 * Records are read one at a time while the text is produced.
 *
 * The whole operation runs under control->ras_tbl_mutex, and every exit
 * path goes through the Out label so the mutex is always released.
 *
 * Return: number of bytes copied to @buf, or a negative error code.
 */
static ssize_t amdgpu_ras_debugfs_table_read(struct file *f, char __user *buf,
					     size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_eeprom_control *control = &ras->eeprom_control;
	const size_t orig_size = size;
	int res = -EINVAL;
	size_t data_len;

	mutex_lock(&control->ras_tbl_mutex);

	/* In each section below, data_len is first the offset of the
	 * section's end; if *pos lies before it, there are bytes of that
	 * section left to print.
	 */

	/* Section 1: table-header caption. */
	data_len = strlen(tbl_hdr_str);
	if (*pos < data_len) {
		data_len -= *pos;
		data_len = min_t(size_t, data_len, size);
		if (copy_to_user(buf, &tbl_hdr_str[*pos], data_len))
			goto Out;
		buf += data_len;
		size -= data_len;
		*pos += data_len;
	}

	/* Section 2: formatted table header values. */
	data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size;
	if (*pos < data_len && size > 0) {
		u8 data[tbl_hdr_fmt_size + 1];
		loff_t lpos;

		snprintf(data, sizeof(data), tbl_hdr_fmt,
			 control->tbl_hdr.header,
			 control->tbl_hdr.version,
			 control->tbl_hdr.first_rec_offset,
			 control->tbl_hdr.tbl_size,
			 control->tbl_hdr.checksum);

		data_len -= *pos;
		data_len = min_t(size_t, data_len, size);
		/* Offset of *pos within this section. */
		lpos = *pos - strlen(tbl_hdr_str);
		if (copy_to_user(buf, &data[lpos], data_len))
			goto Out;
		buf += data_len;
		size -= data_len;
		*pos += data_len;
	}

	/* Section 3: record caption. */
	data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size + strlen(rec_hdr_str);
	if (*pos < data_len && size > 0) {
		loff_t lpos;

		data_len -= *pos;
		data_len = min_t(size_t, data_len, size);
		lpos = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size;
		if (copy_to_user(buf, &rec_hdr_str[lpos], data_len))
			goto Out;
		buf += data_len;
		size -= data_len;
		*pos += data_len;
	}

	/* Section 4: one line of rec_hdr_fmt_size bytes per record. */
	data_len = amdgpu_ras_debugfs_table_size(control);
	if (*pos < data_len && size > 0) {
		u8 dare[RAS_TABLE_RECORD_SIZE];
		u8 data[rec_hdr_fmt_size + 1];
		/* Find the starting record index, s, and the offset, r,
		 * into that record's formatted line.
		 */
		int s = (*pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
			 strlen(rec_hdr_str)) / rec_hdr_fmt_size;
		int r = (*pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
			 strlen(rec_hdr_str)) % rec_hdr_fmt_size;
		struct eeprom_table_record record;

		for ( ; size > 0 && s < control->ras_num_recs; s++) {
			u32 ai = RAS_RI_TO_AI(control, s);
			const char *err_str = "unknown";

			/* Read a single record
			 */
			res = __amdgpu_ras_eeprom_read(control, dare, ai, 1);
			if (res)
				goto Out;
			__decode_table_record_from_buf(control, &record, dare);

			/* err_type comes from the decoded record; never use
			 * it to index the name table without a bounds check.
			 */
			if ((unsigned int)record.err_type < AMDGPU_RAS_EEPROM_ERR_COUNT)
				err_str = record_err_type_str[record.err_type];

			snprintf(data, sizeof(data), rec_hdr_fmt,
				 s,
				 RAS_INDEX_TO_OFFSET(control, ai),
				 err_str,
				 record.bank,
				 record.ts,
				 record.offset,
				 record.mem_channel,
				 record.mcumc_id,
				 record.retired_page);

			data_len = min_t(size_t, rec_hdr_fmt_size - r, size);
			if (copy_to_user(buf, &data[r], data_len)) {
				/* Leave via Out so that ras_tbl_mutex is
				 * released; returning here would leak it.
				 */
				res = -EINVAL;
				goto Out;
			}
			buf += data_len;
			size -= data_len;
			*pos += data_len;
			/* Only the first line can start mid-record. */
			r = 0;
		}
	}
	res = 0;
Out:
	mutex_unlock(&control->ras_tbl_mutex);
	return res < 0 ? res : orig_size - size;
}
|
||||
|
||||
static ssize_t
|
||||
amdgpu_ras_debugfs_eeprom_table_read(struct file *f, char __user *buf,
|
||||
size_t size, loff_t *pos)
|
||||
{
|
||||
struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
|
||||
u8 data[81];
|
||||
int res;
|
||||
|
||||
if (!size)
|
||||
return size;
|
||||
|
||||
if (!ras || !control) {
|
||||
res = snprintf(data, sizeof(data), "Not supported\n");
|
||||
if (*pos >= res)
|
||||
return 0;
|
||||
|
||||
res -= *pos;
|
||||
res = min_t(size_t, res, size);
|
||||
|
||||
if (copy_to_user(buf, &data[*pos], res))
|
||||
return -EINVAL;
|
||||
|
||||
*pos += res;
|
||||
|
||||
return res;
|
||||
} else {
|
||||
return amdgpu_ras_debugfs_table_read(f, buf, size, pos);
|
||||
}
|
||||
}
|
||||
|
||||
const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops = {
|
||||
.owner = THIS_MODULE,
|
||||
.read = amdgpu_ras_debugfs_eeprom_table_read,
|
||||
.write = NULL,
|
||||
.llseek = default_llseek,
|
||||
};
|
||||
|
||||
/**
|
||||
* __verify_ras_table_checksum -- verify the RAS EEPROM table checksum
|
||||
* @control: pointer to control structure
|
||||
|
@ -29,9 +29,10 @@
|
||||
struct amdgpu_device;
|
||||
|
||||
/* Error type of a RAS EEPROM record. The values start at 0 and are
 * contiguous, so AMDGPU_RAS_EEPROM_ERR_COUNT is the number of real
 * error types and can be used to size per-type lookup tables.
 */
enum amdgpu_ras_eeprom_err_type {
	AMDGPU_RAS_EEPROM_ERR_NA,
	AMDGPU_RAS_EEPROM_ERR_RECOVERABLE,
	AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE,
	AMDGPU_RAS_EEPROM_ERR_COUNT,	/* must stay last */
};
|
||||
|
||||
struct amdgpu_ras_eeprom_table_header {
|
||||
@ -121,4 +122,9 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
|
||||
|
||||
inline uint32_t amdgpu_ras_eeprom_max_record_count(void);
|
||||
|
||||
void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control);
|
||||
|
||||
extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
|
||||
extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;
|
||||
|
||||
#endif // _AMDGPU_RAS_EEPROM_H
|
||||
|
Loading…
Reference in New Issue
Block a user