355 lines
9.9 KiB
C
355 lines
9.9 KiB
C
// SPDX-License-Identifier: MIT
|
|
/*
|
|
* Copyright © 2026 Intel Corporation
|
|
*/
|
|
|
|
#include <linux/module.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/netdevice.h>
|
|
#include <linux/xarray.h>
|
|
#include <net/genetlink.h>
|
|
|
|
#include <drm/drm_ras.h>
|
|
|
|
#include "drm_ras_nl.h"
|
|
|
|
/**
|
|
* DOC: DRM RAS Node Management
|
|
*
|
|
* This module provides the infrastructure to manage RAS (Reliability,
|
|
* Availability, and Serviceability) nodes for DRM drivers. Each
|
|
* DRM driver may register one or more RAS nodes, which represent
|
|
* logical components capable of reporting error counters and other
|
|
* reliability metrics.
|
|
*
|
|
* The nodes are stored in a global xarray `drm_ras_xa` to allow
|
|
* efficient lookup by ID. Nodes can be registered or unregistered
|
|
* dynamically at runtime.
|
|
*
|
|
* A Generic Netlink family `drm_ras` exposes two main operations to
|
|
* userspace:
|
|
*
|
|
* 1. LIST_NODES: Dump all currently registered RAS nodes.
|
|
* The user receives an array of node IDs, names, and types.
|
|
*
|
|
* 2. GET_ERROR_COUNTER: Get error counters of a given node.
|
|
* Userspace must provide Node ID, Error ID (Optional for specific counter).
|
|
* Returns all counters of a node if only Node ID is provided or specific
|
|
* error counters.
|
|
*
|
|
* Node registration:
|
|
*
|
|
* - drm_ras_node_register(): Registers a new node and assigns
|
|
* it a unique ID in the xarray.
|
|
* - drm_ras_node_unregister(): Removes a previously registered
|
|
* node from the xarray.
|
|
*
|
|
* Node type:
|
|
*
|
|
* - ERROR_COUNTER:
|
|
* + Currently, only error counters are supported.
|
|
* + The driver must implement the query_error_counter() callback to provide
|
|
* the name and the value of the error counter.
|
|
* + The driver must provide a error_counter_range.last value informing the
|
|
* last valid error ID.
|
|
* + The driver can provide a error_counter_range.first value informing the
|
|
* first valid error ID.
|
|
* + The error counters in the driver doesn't need to be contiguous, but the
|
|
* driver must return -ENOENT to the query_error_counter as an indication
|
|
* that the ID should be skipped and not listed in the netlink API.
|
|
*
|
|
* Netlink handlers:
|
|
*
|
|
* - drm_ras_nl_list_nodes_dumpit(): Implements the LIST_NODES
|
|
* operation, iterating over the xarray.
|
|
* - drm_ras_nl_get_error_counter_dumpit(): Implements the GET_ERROR_COUNTER dumpit
|
|
* operation, fetching all counters from a specific node.
|
|
* - drm_ras_nl_get_error_counter_doit(): Implements the GET_ERROR_COUNTER doit
|
|
* operation, fetching a counter value from a specific node.
|
|
*/
|
|
|
|
static DEFINE_XARRAY_ALLOC(drm_ras_xa);
|
|
|
|
/*
|
|
* The netlink callback context carries dump state across multiple dumpit calls
|
|
*/
|
|
struct drm_ras_ctx {
|
|
/* Which xarray id to restart the dump from */
|
|
unsigned long restart;
|
|
};
|
|
|
|
/**
|
|
* drm_ras_nl_list_nodes_dumpit() - Dump all registered RAS nodes
|
|
* @skb: Netlink message buffer
|
|
* @cb: Callback context for multi-part dumps
|
|
*
|
|
* Iterates over all registered RAS nodes in the global xarray and appends
|
|
* their attributes (ID, name, type) to the given netlink message buffer.
|
|
* Uses @cb->ctx to track progress in case the message buffer fills up, allowing
|
|
* multi-part dump support. On buffer overflow, updates the context to resume
|
|
* from the last node on the next invocation.
|
|
*
|
|
* Return: 0 if all nodes fit in @skb, number of bytes added to @skb if
|
|
* the buffer filled up (requires multi-part continuation), or
|
|
* a negative error code on failure.
|
|
*/
|
|
int drm_ras_nl_list_nodes_dumpit(struct sk_buff *skb,
|
|
struct netlink_callback *cb)
|
|
{
|
|
const struct genl_info *info = genl_info_dump(cb);
|
|
struct drm_ras_ctx *ctx = (void *)cb->ctx;
|
|
struct drm_ras_node *node;
|
|
struct nlattr *hdr;
|
|
unsigned long id;
|
|
int ret;
|
|
|
|
xa_for_each_start(&drm_ras_xa, id, node, ctx->restart) {
|
|
hdr = genlmsg_iput(skb, info);
|
|
if (!hdr) {
|
|
ret = -EMSGSIZE;
|
|
break;
|
|
}
|
|
|
|
ret = nla_put_u32(skb, DRM_RAS_A_NODE_ATTRS_NODE_ID, node->id);
|
|
if (ret) {
|
|
genlmsg_cancel(skb, hdr);
|
|
break;
|
|
}
|
|
|
|
ret = nla_put_string(skb, DRM_RAS_A_NODE_ATTRS_DEVICE_NAME,
|
|
node->device_name);
|
|
if (ret) {
|
|
genlmsg_cancel(skb, hdr);
|
|
break;
|
|
}
|
|
|
|
ret = nla_put_string(skb, DRM_RAS_A_NODE_ATTRS_NODE_NAME,
|
|
node->node_name);
|
|
if (ret) {
|
|
genlmsg_cancel(skb, hdr);
|
|
break;
|
|
}
|
|
|
|
ret = nla_put_u32(skb, DRM_RAS_A_NODE_ATTRS_NODE_TYPE,
|
|
node->type);
|
|
if (ret) {
|
|
genlmsg_cancel(skb, hdr);
|
|
break;
|
|
}
|
|
|
|
genlmsg_end(skb, hdr);
|
|
}
|
|
|
|
if (ret == -EMSGSIZE)
|
|
ctx->restart = id;
|
|
|
|
return ret;
|
|
}
|
|
|
|
static int get_node_error_counter(u32 node_id, u32 error_id,
|
|
const char **name, u32 *value)
|
|
{
|
|
struct drm_ras_node *node;
|
|
|
|
node = xa_load(&drm_ras_xa, node_id);
|
|
if (!node || !node->query_error_counter)
|
|
return -ENOENT;
|
|
|
|
if (error_id < node->error_counter_range.first ||
|
|
error_id > node->error_counter_range.last)
|
|
return -EINVAL;
|
|
|
|
return node->query_error_counter(node, error_id, name, value);
|
|
}
|
|
|
|
static int msg_reply_value(struct sk_buff *msg, u32 error_id,
|
|
const char *error_name, u32 value)
|
|
{
|
|
int ret;
|
|
|
|
ret = nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, error_id);
|
|
if (ret)
|
|
return ret;
|
|
|
|
ret = nla_put_string(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_NAME,
|
|
error_name);
|
|
if (ret)
|
|
return ret;
|
|
|
|
return nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_VALUE,
|
|
value);
|
|
}
|
|
|
|
static int doit_reply_value(struct genl_info *info, u32 node_id,
|
|
u32 error_id)
|
|
{
|
|
struct sk_buff *msg;
|
|
struct nlattr *hdr;
|
|
const char *error_name;
|
|
u32 value;
|
|
int ret;
|
|
|
|
msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
|
|
if (!msg)
|
|
return -ENOMEM;
|
|
|
|
hdr = genlmsg_iput(msg, info);
|
|
if (!hdr) {
|
|
nlmsg_free(msg);
|
|
return -EMSGSIZE;
|
|
}
|
|
|
|
ret = get_node_error_counter(node_id, error_id,
|
|
&error_name, &value);
|
|
if (ret)
|
|
return ret;
|
|
|
|
ret = msg_reply_value(msg, error_id, error_name, value);
|
|
if (ret) {
|
|
genlmsg_cancel(msg, hdr);
|
|
nlmsg_free(msg);
|
|
return ret;
|
|
}
|
|
|
|
genlmsg_end(msg, hdr);
|
|
|
|
return genlmsg_reply(msg, info);
|
|
}
|
|
|
|
/**
|
|
* drm_ras_nl_get_error_counter_dumpit() - Dump all Error Counters
|
|
* @skb: Netlink message buffer
|
|
* @cb: Callback context for multi-part dumps
|
|
*
|
|
* Iterates over all error counters in a given Node and appends
|
|
* their attributes (ID, name, value) to the given netlink message buffer.
|
|
* Uses @cb->ctx to track progress in case the message buffer fills up, allowing
|
|
* multi-part dump support. On buffer overflow, updates the context to resume
|
|
* from the last node on the next invocation.
|
|
*
|
|
* Return: 0 if all errors fit in @skb, number of bytes added to @skb if
|
|
* the buffer filled up (requires multi-part continuation), or
|
|
* a negative error code on failure.
|
|
*/
|
|
int drm_ras_nl_get_error_counter_dumpit(struct sk_buff *skb,
|
|
struct netlink_callback *cb)
|
|
{
|
|
const struct genl_info *info = genl_info_dump(cb);
|
|
struct drm_ras_ctx *ctx = (void *)cb->ctx;
|
|
struct drm_ras_node *node;
|
|
struct nlattr *hdr;
|
|
const char *error_name;
|
|
u32 node_id, error_id, value;
|
|
int ret;
|
|
|
|
if (!info->attrs || GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID))
|
|
return -EINVAL;
|
|
|
|
node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]);
|
|
|
|
node = xa_load(&drm_ras_xa, node_id);
|
|
if (!node)
|
|
return -ENOENT;
|
|
|
|
for (error_id = max(node->error_counter_range.first, ctx->restart);
|
|
error_id <= node->error_counter_range.last;
|
|
error_id++) {
|
|
ret = get_node_error_counter(node_id, error_id,
|
|
&error_name, &value);
|
|
/*
|
|
* For non-contiguous range, driver return -ENOENT as indication
|
|
* to skip this ID when listing all errors.
|
|
*/
|
|
if (ret == -ENOENT)
|
|
continue;
|
|
if (ret)
|
|
return ret;
|
|
|
|
hdr = genlmsg_iput(skb, info);
|
|
|
|
if (!hdr) {
|
|
ret = -EMSGSIZE;
|
|
break;
|
|
}
|
|
|
|
ret = msg_reply_value(skb, error_id, error_name, value);
|
|
if (ret) {
|
|
genlmsg_cancel(skb, hdr);
|
|
break;
|
|
}
|
|
|
|
genlmsg_end(skb, hdr);
|
|
}
|
|
|
|
if (ret == -EMSGSIZE)
|
|
ctx->restart = error_id;
|
|
|
|
return ret;
|
|
}
|
|
|
|
/**
|
|
* drm_ras_nl_get_error_counter_doit() - Query an error counter of an node
|
|
* @skb: Netlink message buffer
|
|
* @info: Generic Netlink info containing attributes of the request
|
|
*
|
|
* Extracts the node ID and error ID from the netlink attributes and
|
|
* retrieves the current value of the corresponding error counter. Sends the
|
|
* result back to the requesting user via the standard Genl reply.
|
|
*
|
|
* Return: 0 on success, or negative errno on failure.
|
|
*/
|
|
int drm_ras_nl_get_error_counter_doit(struct sk_buff *skb,
|
|
struct genl_info *info)
|
|
{
|
|
u32 node_id, error_id;
|
|
|
|
if (!info->attrs ||
|
|
GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID) ||
|
|
GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID))
|
|
return -EINVAL;
|
|
|
|
node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]);
|
|
error_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID]);
|
|
|
|
return doit_reply_value(info, node_id, error_id);
|
|
}
|
|
|
|
/**
|
|
* drm_ras_node_register() - Register a new RAS node
|
|
* @node: Node structure to register
|
|
*
|
|
* Adds the given RAS node to the global node xarray and assigns it
|
|
* a unique ID. Both @node->name and @node->type must be valid.
|
|
*
|
|
* Return: 0 on success, or negative errno on failure:
|
|
*/
|
|
int drm_ras_node_register(struct drm_ras_node *node)
|
|
{
|
|
if (!node->device_name || !node->node_name)
|
|
return -EINVAL;
|
|
|
|
/* Currently, only Error Counter Endpoints are supported */
|
|
if (node->type != DRM_RAS_NODE_TYPE_ERROR_COUNTER)
|
|
return -EINVAL;
|
|
|
|
/* Mandatory entries for Error Counter Node */
|
|
if (node->type == DRM_RAS_NODE_TYPE_ERROR_COUNTER &&
|
|
(!node->error_counter_range.last || !node->query_error_counter))
|
|
return -EINVAL;
|
|
|
|
return xa_alloc(&drm_ras_xa, &node->id, node, xa_limit_32b, GFP_KERNEL);
|
|
}
|
|
EXPORT_SYMBOL(drm_ras_node_register);
|
|
|
|
/**
|
|
* drm_ras_node_unregister() - Unregister a previously registered node
|
|
* @node: Node structure to unregister
|
|
*
|
|
* Removes the given node from the global node xarray using its ID.
|
|
*/
|
|
void drm_ras_node_unregister(struct drm_ras_node *node)
|
|
{
|
|
xa_erase(&drm_ras_xa, node->id);
|
|
}
|
|
EXPORT_SYMBOL(drm_ras_node_unregister);
|