Linux存储IO栈

waston 2019-03-08

展开全文

本系列文章将自底向上分析Linux存储IO栈源码（基于4.4.19），为学习Linux存储做记录。具体目录如下：

一、 Linux内核对象与对象集

二、 sysfs

三、设备模型

四、 SCSI子系统

五、 SCSI磁盘驱动sd

六、 SCSI Target--TCM

七、用户空间IO--UIO

八、在用户空间实现虚拟SCSI磁盘--TCMU

九、通用块层

十、文件系统--VFS

Linux内核对象和对象集

内核对象作为Linux设备驱动模型的基础，主要是抽象和封装总线、设备、驱动、类和接口之间的关系具体实现的相关代码，并在sysfs中呈现。主要抽象成kobject和kset结构：

/*
 * struct kobject - basic kernel object; the unit that is shown in sysfs
 * and linked into a kset.
 */
struct kobject {
    const char      *name;   // name shown in sysfs
    struct list_head    entry;   // list node linking this kobject into its kset's list
    struct kobject      *parent; // parent kobject; expresses the tree hierarchy
    struct kset     *kset;   // kset this kobject is linked into
    struct kobj_type    *ktype;  // common methods and attributes of this kobject
    struct kernfs_node  *sd;     // sysfs directory entry
    struct kref     kref;    // reference count
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
    struct delayed_work release; // work item kobject_release() uses to delay the cleanup
#endif
    unsigned int state_initialized:1;  // set once the kobject has been initialized
    unsigned int state_in_sysfs:1;     // set while the kobject is present in sysfs
    unsigned int state_add_uevent_sent:1; // set once an ADD uevent was sent to user space
    unsigned int state_remove_uevent_sent:1; // set once a REMOVE uevent was sent to user space
    unsigned int uevent_suppress:1; // when set, uevents for this kobject are dropped
};

在kobject结构中ktype域是对kobject一些通用方法和属性进行封装：

/*
 * struct kobj_type - methods and attributes shared by kobjects of one type.
 */
struct kobj_type {
    void (*release)(struct kobject *kobj); // called back when the kobject is being freed
    const struct sysfs_ops *sysfs_ops; // sysfs operation functions
    struct attribute **default_attrs;  // NULL-terminated array of default attributes
        // namespace related operations
    const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
    const void *(*namespace)(struct kobject *kobj);
};

kset是一组kobject的集合，通过kset可以遍历这组kobject，如SCSI子系统中，设备是一种kobject，通过设备集kset，可以遍历所有的设备。

/**
 * struct kset - a set of kobjects of a specific type, belonging to a specific subsystem.
 *
 * A kset defines a group of kobjects.  They can be individually
 * different "types" but overall these kobjects all want to be grouped
 * together and operated on in the same manner.  ksets are used to
 * define the attribute callbacks and other common events that happen to
 * a kobject.
 *
 * @list: the list of all kobjects for this kset
 * @list_lock: a lock for iterating over the kobjects
 * @kobj: the embedded kobject for this kset (recursion, isn't it fun...)
 * @uevent_ops: the set of uevent operations for this kset.  These are
 * called whenever a kobject has something happen to it so that the kset
 * can add new environment variables, or filter out the uevents if so
 * desired.
 */
struct kset {
    struct list_head list; //链入kset的kobject链表
    spinlock_t list_lock;  //遍历链表是的自旋锁struct kobject kobj;   //本身可以当做kobject对待
    const struct kset_uevent_ops *uevent_ops; //发送uevent事件的回调函数
};

在发送事件到用户空间时，可以回调kset_uevent_ops中的3个回调函数

/*
 * struct kset_uevent_ops - per-kset callbacks consulted by
 * kobject_uevent_env() while it builds an event for user space.
 */
struct kset_uevent_ops {
    // called before the event is built; a zero return drops the event
    int (* const filter)(struct kset *kset, struct kobject *kobj);
    // returns the string reported as SUBSYSTEM=; a NULL return drops the event
    const char *(* const name)(struct kset *kset, struct kobject *kobj);
    // lets the kset add its own environment variables; a non-zero return aborts the event
    int (* const uevent)(struct kset *kset, struct kobject *kobj,
              struct kobj_uevent_env *env);
};

filter：在发送事件之前过滤某些事件。
name: 获取名称。
uevent：设置uevent需要的环境变量。

内核对象关系

内核对象相关操作

/* kobject API: initialization, registration, reference counting and helpers. */

/* initialize a kobject and attach its ktype */
void kobject_init(struct kobject *kobj, struct kobj_type *ktype);
/* name an initialized kobject and add it to the hierarchy / sysfs */
int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...);
/* presumably kobject_init() followed by kobject_add() - definition not shown here */
int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...);
/* unlink a kobject from the hierarchy and remove its sysfs directory */
void kobject_del(struct kobject *kobj);
struct kobject *  kobject_create(void);
struct kobject * kobject_create_and_add(const char *name, struct kobject *parent);
int kobject_rename(struct kobject *, const char *new_name);
int kobject_move(struct kobject *, struct kobject *);
/* take / drop a reference on a kobject */
struct kobject *kobject_get(struct kobject *kobj);
void kobject_put(struct kobject *kobj);
const void *kobject_namespace(struct kobject *kobj);
char *kobject_get_path(struct kobject *kobj, gfp_t flag);

内核对象创建及初始化

初始化流程主要在kobject_init:

/**
 * kobject_init - initialize a kobject structure
 * @kobj: pointer to the kobject to initialize
 * @ktype: pointer to the ktype for this kobject.
 *
 * This function will properly initialize a kobject such that it can then
 * be passed to the kobject_add() call.
 *
 * After this function is called, the kobject MUST be cleaned up by a call
 * to kobject_put(), not by a call to kfree directly to ensure that all of
 * the memory is cleaned up properly.
 */
void kobject_init(struct kobject *kobj, struct kobj_type *ktype)
{
    char *err_str;

    if (!kobj) {
        err_str = "invalid kobject pointer!";
        goto error;
    }
    if (!ktype) {
        err_str = "must have a ktype to be initialized properly!\n";
        goto error;
    }
    if (kobj->state_initialized) {  //避免重复初始化
        /* do not error out as sometimes we can recover */
        printk(KERN_ERR "kobject (%p): tried to init an initialized "
               "object, something is seriously wrong.\n", kobj);
        dump_stack();
    }

    kobject_init_internal(kobj); //完成初始化的主要函数
    kobj->ktype = ktype;
    return;

error:
    printk(KERN_ERR "kobject (%p): %s\n", kobj, err_str);
    dump_stack();
}
EXPORT_SYMBOL(kobject_init);

由上面函数可以看出由kobject_init_internal完成初始化：

/*
 * kobject_init_internal - reset the bookkeeping of a kobject.
 *
 * Initializes the reference count and the kset list node, clears the
 * sysfs/uevent state bits and marks the object as initialized.  A NULL
 * @kobj is ignored.
 */
static void kobject_init_internal(struct kobject *kobj)
{
    if (kobj) {
        kref_init(&kobj->kref);
        INIT_LIST_HEAD(&kobj->entry);

        /* not in sysfs yet and no uevent sent so far */
        kobj->state_in_sysfs = 0;
        kobj->state_add_uevent_sent = 0;
        kobj->state_remove_uevent_sent = 0;

        kobj->state_initialized = 1;
    }
}

kobject_create函数仅仅是在调用kobject_init之前，先分配kobject空间。在kobject初始化之后，需要调用kobject_add将kobject添加到sysfs中。

/**
 * kobject_add - name an initialized kobject and add it to the hierarchy
 * @kobj: kobject to add; must already be initialized
 * @parent: parent kobject, may be NULL
 * @fmt: printf-style format used to build the kobject's name
 *
 * With a non-NULL @parent the kobject is placed below it.  With a NULL
 * @parent the kobject of the assigned kset becomes the parent; without a
 * kset the kobject ends up in the root of the sysfs tree.
 *
 * Returns -EINVAL if @kobj is NULL or uninitialized, otherwise the result
 * of kobject_add_varg().  On error the caller must call kobject_put() to
 * release what is associated with the object; it must never kfree() the
 * kobject directly, as that can leak memory.
 *
 * No "add" uevent is generated here.  The caller is expected to set up
 * the sysfs files it needs and then call kobject_uevent() with KOBJ_ADD
 * so that user space learns about the new kobject.
 */
int kobject_add(struct kobject *kobj, struct kobject *parent,
        const char *fmt, ...)
{
    va_list ap;
    int rc;

    if (!kobj)
        return -EINVAL;

    /* the normal case: the object was initialized before being added */
    if (kobj->state_initialized) {
        va_start(ap, fmt);
        rc = kobject_add_varg(kobj, parent, fmt, ap);
        va_end(ap);
        return rc;
    }

    printk(KERN_ERR "kobject '%s' (%p): tried to add an "
           "uninitialized object, something is seriously wrong.\n",
           kobject_name(kobj), kobj);
    dump_stack();
    return -EINVAL;
}
EXPORT_SYMBOL(kobject_add);

kobject_add_varg/kobject_add_internal主要完成将kobject添加到sysfs的操作：

/*
 * kobject_add_varg - va_list backend of kobject_add().
 *
 * Sets the kobject's name from @fmt/@vargs, records @parent and hands
 * over to kobject_add_internal().  Returns the naming error if the name
 * could not be set, otherwise the result of kobject_add_internal().
 */
static __printf(3, 0) int kobject_add_varg(struct kobject *kobj,
                       struct kobject *parent,
                       const char *fmt, va_list vargs)
{
    /* the name is what the kobject is shown as in sysfs */
    int err = kobject_set_name_vargs(kobj, fmt, vargs);

    if (!err) {
        kobj->parent = parent;
        return kobject_add_internal(kobj);
    }

    printk(KERN_ERR "kobject: can not set name properly!\n");
    return err;
}

/*
 * kobject_add_internal - link a named kobject into the hierarchy and sysfs.
 *
 * Takes a reference on the parent, joins the kset (if one is set), and
 * creates the sysfs directory through create_dir().  On failure the kset
 * membership and the parent reference are rolled back.
 *
 * Returns 0 on success, -ENOENT for a NULL @kobj, -EINVAL for a missing
 * or empty name, or the error returned by create_dir().
 */
static int kobject_add_internal(struct kobject *kobj)
{
    int error = 0;
    struct kobject *parent;

    if (!kobj)
        return -ENOENT;

    if (!kobj->name || !kobj->name[0]) {
        WARN(1, "kobject: (%p): attempted to be registered with empty "
             "name!\n", kobj);
        return -EINVAL;
    }

    parent = kobject_get(kobj->parent); // take a reference on the parent object

    /* join kset if set, use it as parent if we do not already have one */
    if (kobj->kset) { // a kset is set: if there is no parent, the kset's kobject becomes the parent
        if (!parent)
            parent = kobject_get(&kobj->kset->kobj);
        kobj_kset_join(kobj);
        kobj->parent = parent;
    }

    pr_debug("kobject: '%s' (%p): %s: parent: '%s', set: '%s'\n",
         kobject_name(kobj), kobj, __func__,
         parent ? kobject_name(parent) : "<NULL>",
         kobj->kset ? kobject_name(&kobj->kset->kobj) : "<NULL>");

    error = create_dir(kobj);  // create the sysfs directory and attribute files
    if (error) {  // roll back on failure
        kobj_kset_leave(kobj);
        kobject_put(parent);
        kobj->parent = NULL;

        /* be noisy on error issues */
        if (error == -EEXIST)
            WARN(1, "%s failed for %s with "
                 "-EEXIST, don't try to register things with "
                 "the same name in the same directory.\n",
                 __func__, kobject_name(kobj));
        else
            WARN(1, "%s failed for %s (error: %d parent: %s)\n",
                 __func__, kobject_name(kobj), error,
                 parent ? kobject_name(parent) : "'none'");
    } else
        kobj->state_in_sysfs = 1; // record that the kobject is now in sysfs

    return error;
}

由create_dir在sysfs创建真实的目录和文件，这点由下一篇sysfs详细描述。理解了kobject_init和kobject_add之后，由名字就可以知道kobject_init_and_add和kobject_create_and_add这两个函数的作用。

内核对象释放

调用kobject_del将kobject从sysfs中删除，再由kobject_put在引用计数归零时释放：

/**
 * kobject_del - unlink kobject from hierarchy.
 * @kobj: object.
 *
 * Removes the sysfs directory, clears state_in_sysfs, takes the kobject
 * out of its kset and drops the reference held on the parent.  A NULL
 * @kobj is ignored.
 */
void kobject_del(struct kobject *kobj)
{
    struct kernfs_node *sd;

    if (!kobj)
        return;

    sd = kobj->sd; // saved before the directory is removed, so it can be put afterwards
    sysfs_remove_dir(kobj); // remove the kobject's directory from sysfs
    sysfs_put(sd);

    kobj->state_in_sysfs = 0; // no longer present in sysfs
    kobj_kset_leave(kobj);  // unlink the kobject from its kset list
    kobject_put(kobj->parent); // drop the reference on the parent taken in kobject_add_internal()
    kobj->parent = NULL;
}
EXPORT_SYMBOL(kobject_del);

/**
 * kobject_put - drop one reference on a kobject
 * @kobj: object, may be NULL (then nothing happens)
 *
 * Warns if the kobject was never initialized.  The reference is dropped
 * through kref_put(); when the count reaches zero, kobject_release() is
 * invoked as the release callback.
 */
void kobject_put(struct kobject *kobj)
{
    if (!kobj)
        return;

    if (!kobj->state_initialized)
        WARN(1, KERN_WARNING "kobject: '%s' (%p): is not "
               "initialized, yet kobject_put() is being "
               "called.\n", kobject_name(kobj), kobj);

    /* kobject_release() runs once the last reference is gone */
    kref_put(&kobj->kref, kobject_release);
}
EXPORT_SYMBOL(kobject_put);

/*
 * kref_put - drop a single reference.
 *
 * Thin wrapper around kref_sub() with a count of 1; returns what
 * kref_sub() returns (1 if @release was called, 0 otherwise).
 */
static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
{
    return kref_sub(kref, 1, release);
}

/*
 * kref_sub - subtract @count references from @kref.
 *
 * Warns if @release is NULL.  When the subtraction brings the counter to
 * zero, @release is called with @kref and 1 is returned; otherwise 0 is
 * returned and @release is not called.
 */
static inline int kref_sub(struct kref *kref, unsigned int count,
         void (*release)(struct kref *kref))
{
    WARN_ON(release == NULL);

    /* references remain: nothing to release */
    if (!atomic_sub_and_test((int) count, &kref->refcount))
        return 0;

    /* last reference dropped; for kobjects this is kobject_release() */
    release(kref);
    return 1;
}

根据上面的代码追踪，得知kobject_release才是释放kobject的主角：

/*
 * kobject_release - kref release callback for kobjects.
 *
 * Maps @kref back to its enclosing kobject.  With
 * CONFIG_DEBUG_KOBJECT_RELEASE the cleanup is deferred through delayed
 * work by HZ plus 0..3 further multiples of HZ; otherwise
 * kobject_cleanup() is called right away.
 */
static void kobject_release(struct kref *kref)
{
    struct kobject *obj = container_of(kref, struct kobject, kref);
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
    unsigned long jitter = get_random_int() & 0x3;
    unsigned long ticks = HZ + HZ * jitter;
    pr_info("kobject: '%s' (%p): %s, parent %p (delayed %ld)\n",
         kobject_name(obj), obj, __func__, obj->parent, ticks);
    /* kobject_delayed_cleanup() performs the cleanup once the delay expires */
    INIT_DELAYED_WORK(&obj->release, kobject_delayed_cleanup);
    schedule_delayed_work(&obj->release, ticks);
#else
    /* no debugging delay: clean up immediately */
    kobject_cleanup(obj);
#endif
}

如果在内核编译时指定CONFIG_DEBUG_KOBJECT_RELEASE，则使用延迟release方式调用kobject_delayed_cleanup，否则直接调用kobject_cleanup。

#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
/*
 * kobject_delayed_cleanup - delayed-work handler scheduled by
 * kobject_release(); recovers the kobject that embeds @work and hands it
 * to kobject_cleanup().
 */
static void kobject_delayed_cleanup(struct work_struct *work)
{
    struct delayed_work *dwork = to_delayed_work(work);
    struct kobject *kobj = container_of(dwork, struct kobject, release);

    /* the delayed path ends in the same cleanup routine */
    kobject_cleanup(kobj);
}
#endif

/*
 * kobject_cleanup - free kobject resources.
 * @kobj: object to cleanup
 *
 * Sends a pending "remove" uevent, removes the object from sysfs if the
 * caller did not, calls the ktype's release() callback and finally frees
 * the name.
 */
static void kobject_cleanup(struct kobject *kobj)
{
    struct kobj_type *t = get_ktype(kobj);
    /* captured up front; presumably because t->release() may free kobj - confirm */
    const char *name = kobj->name;

    pr_debug("kobject: '%s' (%p): %s, parent %p\n",
         kobject_name(kobj), kobj, __func__, kobj->parent);

    if (t && !t->release)
        pr_debug("kobject: '%s' (%p): does not have a release() "
             "function, it is broken and must be fixed.\n",
             kobject_name(kobj), kobj);

    /* send "remove" if the caller did not do it but sent "add" */
    if (kobj->state_add_uevent_sent && !kobj->state_remove_uevent_sent) {
        pr_debug("kobject: '%s' (%p): auto cleanup 'remove' event\n",
             kobject_name(kobj), kobj);
        kobject_uevent(kobj, KOBJ_REMOVE); // the state bits ensure REMOVE is sent only once
    }

    /* remove from sysfs if the caller did not do it */
    if (kobj->state_in_sysfs) {
        pr_debug("kobject: '%s' (%p): auto cleanup kobject_del\n",
             kobject_name(kobj), kobj);
        kobject_del(kobj); // the caller left the object in sysfs, so remove it here
    }

    if (t && t->release) {
        pr_debug("kobject: '%s' (%p): calling ktype release\n",
             kobject_name(kobj), kobj);
        t->release(kobj); // invoke the release callback of the kobj_type
    }

    /* free name if we allocated it */
    if (name) {
        pr_debug("kobject: '%s': free name\n", name);
        kfree_const(name);
    }
}

内核对象集相关操作

/* kset API: initialization, creation, registration and removal. */

/* initialize the embedded kobject, the list head and the list lock */
void kset_init(struct kset *kset);
/* NOTE(review): the definition quoted further down is declared static - confirm this prototype */
struct kset *kset_create(const char *name, const struct kset_uevent_ops *uevent_ops, struct kobject *parent_kobj);
/* initialize a kset, add it to sysfs and send the ADD uevent */
int kset_register(struct kset *kset);
/* remove a kset from sysfs and drop a reference on it */
void kset_unregister(struct kset *kset);
struct kset * kset_create_and_add(const char *name, const struct kset_uevent_ops *u, struct kobject *parent_kobj);

内核对象集创建及初始化

内核对象集由kset_create创建

/**
 * kset_create - allocate and set up a struct kset
 *
 * @name: name of the kset
 * @uevent_ops: uevent callbacks for the kset
 * @parent_kobj: parent kobject of the kset, if any
 *
 * The kset is allocated zeroed, named, and given kset_ktype as the ktype
 * of its embedded kobject.  It is not registered here: call
 * kset_register() to make it show up in sysfs, and kset_unregister() when
 * done with a registered kset; it is then freed once no longer in use.
 *
 * Returns the new kset, or NULL if allocation or naming failed.
 */
static struct kset *kset_create(const char *name,
                const struct kset_uevent_ops *uevent_ops,
                struct kobject *parent_kobj)
{
    struct kset *ks = kzalloc(sizeof(*ks), GFP_KERNEL);

    if (!ks)
        return NULL;

    /* the name under which the kset appears in sysfs */
    if (kobject_set_name(&ks->kobj, "%s", name)) {
        kfree(ks);
        return NULL;
    }

    ks->uevent_ops = uevent_ops;
    ks->kobj.parent = parent_kobj;

    /*
     * The embedded kobject gets kset_ktype and belongs to no kset
     * itself, so that it can be freed properly once it is unused.
     */
    ks->kobj.ktype = &kset_ktype;
    ks->kobj.kset = NULL;

    return ks;
}

内核对象集由kset_init执行初始化：

/**
 * kset_init - initialize a kset for use
 * @k: kset
 *
 * Initializes the embedded kobject, the list of member kobjects and the
 * spinlock protecting that list.
 */
void kset_init(struct kset *k)
{
    kobject_init_internal(&k->kobj);  // initialize the embedded kobject
    INIT_LIST_HEAD(&k->list);
    spin_lock_init(&k->list_lock);
}

/*
 * kobject_init_internal - reset the bookkeeping of a kobject.
 *
 * Initializes the reference count and the kset list node, clears the
 * sysfs/uevent state bits and marks the object as initialized.  A NULL
 * @kobj is ignored.
 */
static void kobject_init_internal(struct kobject *kobj)
{
    if (kobj) {
        kref_init(&kobj->kref);
        INIT_LIST_HEAD(&kobj->entry);

        /* not in sysfs yet and no uevent sent so far */
        kobj->state_in_sysfs = 0;
        kobj->state_add_uevent_sent = 0;
        kobj->state_remove_uevent_sent = 0;

        kobj->state_initialized = 1;
    }
}

初始化kset之后，调用kset_register，将kset添加到sysfs：

/**
 * kset_register - initialize a kset and add it to sysfs
 * @k: kset
 *
 * Returns -EINVAL for a NULL @k, the error from kobject_add_internal()
 * if adding failed, and 0 on success.  The KOBJ_ADD uevent is only sent
 * after the kset was added successfully.
 */
int kset_register(struct kset *k)
{
    int err;

    if (!k)
        return -EINVAL;

    kset_init(k);

    /* the embedded kobject is added exactly like any other kobject */
    err = kobject_add_internal(&k->kobj);
    if (!err)
        kobject_uevent(&k->kobj, KOBJ_ADD); /* tell user space about it */

    return err;
}
EXPORT_SYMBOL(kset_register);

经过kset_create, kset_init和kset_register之后，kset已初始化并添加完成。当然kset_create_and_add包含了这三个函数。

内核对象集释放

内核对象集的释放过程与kobject的释放过程类似，由kset_unregister完成：

/**
 * kset_unregister - remove a kset
 * @k: kset, may be NULL (then nothing happens)
 *
 * Removes the embedded kobject from sysfs and drops a reference on it.
 */
void kset_unregister(struct kset *k)
{
    if (k) {
        /* take the sysfs directory and attribute files away */
        kobject_del(&k->kobj);
        /* from here on the same path as releasing a plain kobject */
        kobject_put(&k->kobj);
    }
}
EXPORT_SYMBOL(kset_unregister);

发送事件到用户空间

由前面的代码可以看到无论kobject或是kset，都会向用户空间发送事件，由kobject_uevent函数通过设置环境变量的方式完成：

/*
 * struct kobj_uevent_env - environment assembled for one uevent.
 */
struct kobj_uevent_env {
    char *argv[3];                // command line passed to the user-mode helper
    char *envp[UEVENT_NUM_ENVP];  // array of environment variable strings
    int envp_idx;                 // number of entries currently used in envp
    char buf[UEVENT_BUFFER_SIZE]; // buffer holding the environment variable data
    int buflen;                   // presumably the number of bytes used in buf - confirm
};

/**
 * kobject_uevent - notify userspace by sending an uevent
 *
 * @kobj: struct kobject that the action is happening to
 * @action: action that is happening
 *
 * Thin wrapper around kobject_uevent_env() without extra environment
 * data.
 *
 * Returns 0 if kobject_uevent() is completed with success or the
 * corresponding error when it fails.
 */
int kobject_uevent(struct kobject *kobj, enum kobject_action action)
{
    return kobject_uevent_env(kobj, action, NULL); // the function that actually sends the event
}
EXPORT_SYMBOL_GPL(kobject_uevent);

/**
 * kobject_uevent_env - send an uevent with environmental data
 *
 * @kobj: struct kobject that the action is happening to
 * @action: action that is happening
 * @envp_ext: pointer to environmental data (NULL-terminated array), may be NULL
 *
 * Finds the kset responsible for @kobj, lets its uevent_ops filter and
 * extend the event, builds the ACTION/DEVPATH/SUBSYSTEM/SEQNUM
 * environment and delivers it over netlink (CONFIG_NET) and/or through
 * the user-mode helper (CONFIG_UEVENT_HELPER).
 *
 * Returns 0 if kobject_uevent_env() is completed with success or the
 * corresponding error when it fails.  A suppressed or filtered event
 * also returns 0.
 */
int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
               char *envp_ext[])
{
    struct kobj_uevent_env *env;
    const char *action_string = kobject_actions[action];
    const char *devpath = NULL;
    const char *subsystem;
    struct kobject *top_kobj;
    struct kset *kset;
    const struct kset_uevent_ops *uevent_ops;
    int i = 0;
    int retval = 0;
#ifdef CONFIG_NET
    struct uevent_sock *ue_sk;
#endif

    pr_debug("kobject: '%s' (%p): %s\n",
         kobject_name(kobj), kobj, __func__);

    /* search the kset we belong to */
    top_kobj = kobj;
    while (!top_kobj->kset && top_kobj->parent)  // walk up to the nearest kset; the kset carries the uevent_ops
        top_kobj = top_kobj->parent;

    if (!top_kobj->kset) {
        pr_debug("kobject: '%s' (%p): %s: attempted to send uevent "
             "without kset!\n", kobject_name(kobj), kobj,
             __func__);
        return -EINVAL;
    }

    kset = top_kobj->kset;
    uevent_ops = kset->uevent_ops;  // the kset's uevent_ops drive the rest of the send

    /* skip the event, if uevent_suppress is set*/
    if (kobj->uevent_suppress) {  // kobjects marked uevent_suppress send nothing
        pr_debug("kobject: '%s' (%p): %s: uevent_suppress "
                 "caused the event to drop!\n",
                 kobject_name(kobj), kobj, __func__);
        return 0;
    }
    /* skip the event, if the filter returns zero. */
    if (uevent_ops && uevent_ops->filter)  // ask the kset's filter callback
        if (!uevent_ops->filter(kset, kobj)) {
            pr_debug("kobject: '%s' (%p): %s: filter function "
                 "caused the event to drop!\n",
                 kobject_name(kobj), kobj, __func__);
            return 0;
        }

    /* originating subsystem */
    if (uevent_ops && uevent_ops->name)  // subsystem name: from the callback, else the kset's own name
        subsystem = uevent_ops->name(kset, kobj);
    else
        subsystem = kobject_name(&kset->kobj);
    if (!subsystem) {
        pr_debug("kobject: '%s' (%p): %s: unset subsystem caused the "
             "event to drop!\n", kobject_name(kobj), kobj,
             __func__);
        return 0;
    }

    /* environment buffer */
    env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); // allocate the kobj_uevent_env
    if (!env)
        return -ENOMEM;

    /* complete object path */
    devpath = kobject_get_path(kobj, GFP_KERNEL);
    if (!devpath) {
        retval = -ENOENT;
        goto exit;
    }

    /* default keys: add the standard environment variables */
    retval = add_uevent_var(env, "ACTION=%s", action_string);
    if (retval)
        goto exit;
    retval = add_uevent_var(env, "DEVPATH=%s", devpath);
    if (retval)
        goto exit;
    retval = add_uevent_var(env, "SUBSYSTEM=%s", subsystem);
    if (retval)
        goto exit;

    /* keys passed in from the caller */
    if (envp_ext) {
        for (i = 0; envp_ext[i]; i++) {
            retval = add_uevent_var(env, "%s", envp_ext[i]);
            if (retval)
                goto exit;
        }
    }

    /* let the kset specific function add its stuff */
    if (uevent_ops && uevent_ops->uevent) { // uevent callback adds subsystem-specific environment variables
        retval = uevent_ops->uevent(kset, kobj, env);
        if (retval) {
            pr_debug("kobject: '%s' (%p): %s: uevent() returned "
                 "%d\n", kobject_name(kobj), kobj,
                 __func__, retval);
            goto exit;
        }
    }

    /*
     * Mark "add" and "remove" events in the object to ensure proper
     * events to userspace during automatic cleanup. If the object did
     * send an "add" event, "remove" will automatically generated by
     * the core, if not already done by the caller.
     */
    if (action == KOBJ_ADD)
        kobj->state_add_uevent_sent = 1;
    else if (action == KOBJ_REMOVE)
        kobj->state_remove_uevent_sent = 1;

    mutex_lock(&uevent_sock_mutex);
    /* we will send an event, so request a new sequence number */
    retval = add_uevent_var(env, "SEQNUM=%llu", (unsigned long long)++uevent_seqnum);
    if (retval) {
        mutex_unlock(&uevent_sock_mutex);
        goto exit;
    }

#if defined(CONFIG_NET)  // with CONFIG_NET the event is sent over netlink
    /* send netlink message */
    list_for_each_entry(ue_sk, &uevent_sock_list, list) {
        struct sock *uevent_sock = ue_sk->sk;
        struct sk_buff *skb;
        size_t len;

        if (!netlink_has_listeners(uevent_sock, 1))
            continue;

        /* allocate message with the maximum possible size */
        len = strlen(action_string) + strlen(devpath) + 2;
        skb = alloc_skb(len + env->buflen, GFP_KERNEL);
        if (skb) {
            char *scratch;

            /* add header */
            scratch = skb_put(skb, len);
            sprintf(scratch, "%s@%s", action_string, devpath);

            /* copy keys to our continuous event payload buffer */
            for (i = 0; i < env->envp_idx; i++) {
                len = strlen(env->envp[i]) + 1;
                scratch = skb_put(skb, len);
                strcpy(scratch, env->envp[i]);
            }

            NETLINK_CB(skb).dst_group = 1;
            retval = netlink_broadcast_filtered(uevent_sock, skb, // broadcast to netlink group 1
                                0, 1, GFP_KERNEL,
                                kobj_bcast_filter,
                                kobj);
            /* ENOBUFS should be handled in userspace */
            if (retval == -ENOBUFS || retval == -ESRCH)
                retval = 0;
        } else
            retval = -ENOMEM;
    }
#endif
    mutex_unlock(&uevent_sock_mutex);

#ifdef CONFIG_UEVENT_HELPER  // with CONFIG_UEVENT_HELPER the event is also handed to the user-mode helper
    /* call uevent_helper, usually only enabled during early boot */
    if (uevent_helper[0] && !kobj_usermode_filter(kobj)) {
        struct subprocess_info *info;

        retval = add_uevent_var(env, "HOME=/");
        if (retval)
            goto exit;
        retval = add_uevent_var(env,
                    "PATH=/sbin:/bin:/usr/sbin:/usr/bin");
        if (retval)
            goto exit;
        retval = init_uevent_argv(env, subsystem); // assemble the user-space command and its arguments
        if (retval)
            goto exit;

        retval = -ENOMEM;
        info = call_usermodehelper_setup(env->argv[0], env->argv,  // run the helper program; presumably the uevent_helper path - confirm
                         env->envp, GFP_KERNEL,
                         NULL, cleanup_uevent_env, env);
        if (info) {
            retval = call_usermodehelper_exec(info, UMH_NO_WAIT);
            env = NULL; /* freed by cleanup_uevent_env */
        }
    }
#endif

exit:
    kfree(devpath);
    kfree(env);
    return retval;
}
EXPORT_SYMBOL_GPL(kobject_uevent_env);

sysfs与内核对象

本篇文章不是以文件系统的角度来详细描述sysfs，而是从内核对象如何通过sysfs表示整个设备驱动模型为切入点，进一步理解Linux内核对象。

内核对象添加到sysfs

在上文《内核对象与对象集》中，将kobject添加到sysfs中，kobject_add –> kobject_add_varg –> kobject_add_internal，调用create_dir创建sysfs目录和属性文件。

/*
 * create_dir - create the sysfs directory and default attribute files
 * for @kobj.
 *
 * Returns 0 on success or the error from sysfs_create_dir_ns() /
 * populate_dir(); if populating fails the directory is removed again.
 */
static int create_dir(struct kobject *kobj)
{
    const struct kobj_ns_type_operations *ops;
    int error;
        // create the directory for the kobject through the sysfs interface
    error = sysfs_create_dir_ns(kobj, kobject_namespace(kobj));
    if (error)
        return error;

    error = populate_dir(kobj);  // create the default attribute files inside the directory
    if (error) {
        sysfs_remove_dir(kobj);
        return error;
    }

    /*
     * @kobj->sd may be deleted by an ancestor going away.  Hold an
     * extra reference so that it stays until @kobj is gone.
     */
    sysfs_get(kobj->sd);

    /*
     * If @kobj has ns_ops, its children need to be filtered based on
     * their namespace tags.  Enable namespace support on @kobj->sd.
     */
    ops = kobj_child_ns_ops(kobj);
    if (ops) {
        BUG_ON(ops->type <= KOBJ_NS_TYPE_NONE);
        BUG_ON(ops->type >= KOBJ_NS_TYPES);
        BUG_ON(!kobj_ns_type_registered(ops->type));

        sysfs_enable_ns(kobj->sd);
    }

    return 0;
}

/*
 * populate_dir - populate directory with attributes.
 * @kobj: object we're working on.
 *
 * Most subsystems have a set of default attributes that are associated
 * with an object that registers with them.  This is a helper called during
 * object registration that loops through the default attributes of the
 * subsystem and creates attributes files for them in sysfs.
 */
static int populate_dir(struct kobject *kobj)
{
    struct kobj_type *t = get_ktype(kobj);
    struct attribute *attr;
    int error = 0;
    int i;

    if (t && t->default_attrs) {
        for (i = 0; (attr = t->default_attrs[i]) != NULL; i++) {
            error = sysfs_create_file(kobj, attr); //为每个属性创建对应的文件
            if (error)
                break;
        }
    }
    return error;
}

create_dir通过调用sysfs_create_dir_ns创建sysfs中的目录，调用sysfs_create_file创建属性文件。

sysfs的核心结构

kernfs_node代表sysfs中的每个节点。

/*
 * kernfs_node - the building block of kernfs hierarchy.  Each and every
 * kernfs node is represented by single kernfs_node.  Most fields are
 * private to kernfs and shouldn't be accessed directly by kernfs users.
 *
 * As long as s_count reference is held, the kernfs_node itself is
 * accessible.  Dereferencing elem or any other outer entity requires
 * active reference.
 */
struct kernfs_node {
    atomic_t        count;   // reference count
    atomic_t        active;  // active reference count
#ifdef CONFIG_DEBUG_LOCK_ALLOC
    struct lockdep_map  dep_map;
#endif
    /*
     * Use kernfs_get_parent() and kernfs_name/path() instead of
     * accessing the following two fields directly.  If the node is
     * never moved to a different parent, it is safe to access the
     * parent directly.
     */
    struct kernfs_node  *parent; // parent node
    const char      *name;       // node name, as shown in sysfs

    struct rb_node      rb;      // link into the parent's red-black tree

    const void      *ns;    /* namespace tag */
    unsigned int        hash;   /* ns + name hash, used as the red-black tree key */
    union {
        struct kernfs_elem_dir      dir;     // used when the node is a directory
        struct kernfs_elem_symlink  symlink; // used when the node is a symlink
        struct kernfs_elem_attr     attr;    // used when the node is an attribute file
    };

    void            *priv;

    unsigned short      flags; // flag bits: directory, symlink or attribute file, and state such as removed
    umode_t         mode;      // access mode of the node in sysfs
    unsigned int        ino;   // unique number of the node
    struct kernfs_iattrs    *iattr;  // non-default inode attributes; NULL when there are none
};

在sysfs中创建目录sysfs_create_dir_ns

/**
 * sysfs_create_dir_ns - create the sysfs directory of a kobject
 * @kobj: object the directory is created for; must not be NULL
 * @ns: namespace tag to use
 *
 * The directory is created below the parent kobject's directory, or
 * below the sysfs root when the kobject has no parent.  On success the
 * new node is stored in @kobj->sd and 0 is returned.  Returns -ENOENT if
 * no parent node is available, otherwise the error reported by
 * kernfs_create_dir_ns(); a duplicate name (-EEXIST) is additionally
 * reported through sysfs_warn_dup().
 */
int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
{
    struct kernfs_node *parent_kn, *dir_kn;

    BUG_ON(!kobj);

    /* the kobject's parent decides the location; default is the sysfs root */
    parent_kn = kobj->parent ? kobj->parent->sd : sysfs_root_kn;
    if (!parent_kn)
        return -ENOENT;

    dir_kn = kernfs_create_dir_ns(parent_kn, kobject_name(kobj),
                      S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns);
    if (!IS_ERR(dir_kn)) {
        kobj->sd = dir_kn;
        return 0;
    }

    if (PTR_ERR(dir_kn) == -EEXIST)
        sysfs_warn_dup(parent_kn, kobject_name(kobj));
    return PTR_ERR(dir_kn);
}

/**
 * kernfs_create_dir_ns - create a kernfs directory node
 * @parent: directory in which the new directory is created
 * @name: name of the new directory
 * @mode: mode of the new directory
 * @priv: opaque data stored in the new node
 * @ns: optional namespace tag of the directory
 *
 * Returns the new node on success.  On failure an ERR_PTR() is returned:
 * -ENOMEM if the node could not be allocated, otherwise the error from
 * kernfs_add_one().
 */
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
                     const char *name, umode_t mode,
                     void *priv, const void *ns)
{
    struct kernfs_node *dir;
    int err;

    /* allocate and initialize; KERNFS_DIR marks the node as a directory */
    dir = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);
    if (!dir)
        return ERR_PTR(-ENOMEM);

    dir->dir.root = parent->dir.root;  /* same root as the parent */
    dir->ns = ns;
    dir->priv = priv;

    /* link the node into the parent's red-black tree */
    err = kernfs_add_one(dir);
    if (err) {
        kernfs_put(dir);
        return ERR_PTR(err);
    }

    return dir;
}

kernfs_create_dir_ns函数中的两个主要函数kernfs_new_node和kernfs_add_one，在创建文件和创建符号链接同样使用，仅是参数不同。

为kernfs_node结构分配空间，并初始化

/*
 * kernfs_new_node - allocate a kernfs_node below @parent.
 *
 * The node is allocated and initialized by __kernfs_new_node() using the
 * root of @parent.  On success a reference is taken on @parent and it is
 * recorded as the node's parent.  Returns the new node, or NULL if the
 * allocation failed.
 */
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
                    const char *name, umode_t mode,
                    unsigned flags)
{
    struct kernfs_node *node;

    node = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
    if (!node)
        return NULL;

    kernfs_get(parent);
    node->parent = parent;
    return node;
}

/*
 * __kernfs_new_node - allocate and initialize a kernfs_node.
 *
 * Duplicates @name, allocates a zeroed node from kernfs_node_cache and
 * obtains an id from @root->ino_ida.  Returns the new node with a
 * reference count of 1, or NULL on failure; everything acquired so far
 * is released on the error paths.
 */
static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
                         const char *name, umode_t mode,
                         unsigned flags)
{
    struct kernfs_node *kn;
    int ret;

    name = kstrdup_const(name, GFP_KERNEL); // duplicate the name string
    if (!name)
        return NULL;

    kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); // allocate a zeroed kernfs_node from the cache
    if (!kn)
        goto err_out1;

    /*
     * If the ino of the sysfs entry created for a kmem cache gets
     * allocated from an ida layer, which is accounted to the memcg that
     * owns the cache, the memcg will get pinned forever. So do not account
     * ino ida allocations.
     */
    ret = ida_simple_get(&root->ino_ida, 1, 0,  // obtain a number that identifies this kernfs_node
                 GFP_KERNEL | __GFP_NOACCOUNT);
    if (ret < 0)
        goto err_out2;
    kn->ino = ret;

    atomic_set(&kn->count, 1);  // initial reference count
    atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
    RB_CLEAR_NODE(&kn->rb);
    // fill in the remaining kernfs_node fields
    kn->name = name;
    kn->mode = mode;
    kn->flags = flags;

    return kn;

 err_out2:
    kmem_cache_free(kernfs_node_cache, kn);
 err_out1:
    kfree_const(name);
    return NULL;
}

将kernfs_node添加到parent的红黑树中：

/**
 *  kernfs_add_one - add kernfs_node to parent without warning
 *  @kn: kernfs_node to be added
 *
 *  The caller must already have initialized @kn->parent.  This
 *  function increments nlink of the parent's inode if @kn is a
 *  directory and link into the children list of the parent.
 *
 *  The checks and the linking run under kernfs_mutex; the node is
 *  activated after the mutex has been released.
 *
 *  RETURNS:
 *  0 on success, -EEXIST if entry with the given name already
 *  exists.  -EINVAL is returned on a namespace mismatch or when the
 *  parent is not a directory, -ENOENT when the parent is flagged
 *  KERNFS_EMPTY_DIR or is activated but no longer active.
 */
int kernfs_add_one(struct kernfs_node *kn)
{
    struct kernfs_node *parent = kn->parent;
    struct kernfs_iattrs *ps_iattr;
    bool has_ns;
    int ret;

    mutex_lock(&kernfs_mutex);

    ret = -EINVAL;
    has_ns = kernfs_ns_enabled(parent);
    if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
         has_ns ? "required" : "invalid", parent->name, kn->name))
        goto out_unlock;

    if (kernfs_type(parent) != KERNFS_DIR) // the parent must be a directory
        goto out_unlock;

    ret = -ENOENT;
    if (parent->flags & KERNFS_EMPTY_DIR)  // the parent is flagged KERNFS_EMPTY_DIR
        goto out_unlock;
    // an activated parent must still be active
    if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
        goto out_unlock;

    kn->hash = kernfs_name_hash(kn->name, kn->ns); // key used for red-black tree comparison

    ret = kernfs_link_sibling(kn); // link the node into the parent's red-black tree
    if (ret)
        goto out_unlock;

    /* Update timestamps on the parent */
    ps_iattr = parent->iattr;
    if (ps_iattr) {
        struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
        ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
    }

    mutex_unlock(&kernfs_mutex);

    /*
     * Activate the new node unless CREATE_DEACTIVATED is requested.
     * If not activated here, the kernfs user is responsible for
     * activating the node with kernfs_activate().  A node which hasn't
     * been activated is not visible to userland and its removal won't
     * trigger deactivation.
     */
    if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
        kernfs_activate(kn);
    return 0;

out_unlock:
    mutex_unlock(&kernfs_mutex);
    return ret;
}

sysfs红黑树中的key：

/**
 *  kernfs_name_hash - compute the rbtree key of a node
 *  @name: Null terminated string to hash
 *  @ns:   Namespace tag to hash
 *
 *  Hashes every character of @name, mixes in a 31 bit hash of @ns and
 *  masks the result to 31 bits (so it fits in an off_t).  The values
 *  0, 1 and INT_MAX are never returned.
 */
static unsigned int kernfs_name_hash(const char *name, const void *ns)
{
    unsigned long h = init_name_hash();
    unsigned int left;

    for (left = strlen(name); left; left--)
        h = partial_name_hash(*name++, h);

    h = end_name_hash(h) ^ hash_ptr((void *)ns, 31);
    h &= 0x7fffffffU;

    /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
    if (h < 2)
        h += 2;
    if (h >= INT_MAX)
        h = INT_MAX - 1;

    return h;
}

/*
 * Three-level ordering used by the sibling rbtree: compare the hash first,
 * then the namespace pointer, and only for equal hash and namespace fall
 * back to a string comparison of the names.  Returns <0, 0 or >0 for the
 * key (@hash, @ns, @name) relative to @kn.
 */
static int kernfs_name_compare(unsigned int hash, const char *name,
                   const void *ns, const struct kernfs_node *kn)
{
    if (hash != kn->hash)
        return hash < kn->hash ? -1 : 1;

    if (ns != kn->ns)
        return ns < kn->ns ? -1 : 1;

    return strcmp(name, kn->name);
}

kernfs_name_hash: 根据name和ns计算kernfs_node的hash值，保存在kernfs_node.hash域中。
kernfs_name_compare: sysfs红黑树key的比较函数，比较优先级是：hash > ns > name

kernfs_node链入parent节点红黑树中：

/**
 *  kernfs_link_sibling - link kernfs_node into sibling rbtree
 *  @kn: kernfs_node of interest
 *
 *  Walk the rbtree rooted at @kn->parent->dir.children to find the slot
 *  for @kn, insert it there and rebalance.  Nodes are ordered by
 *  kernfs_sd_compare() (hash, then ns, then name).
 *
 *  Locking:
 *  mutex_lock(kernfs_mutex)
 *
 *  RETURNS:
 *  0 on success, -EEXIST if an equal node is already linked.
 */
static int kernfs_link_sibling(struct kernfs_node *kn)
{
    struct rb_node **link = &kn->parent->dir.children.rb_node;
    struct rb_node *above = NULL;

    /* descend until an empty child slot is reached */
    while (*link) {
        struct kernfs_node *cur = rb_to_kn(*link);
        int cmp = kernfs_sd_compare(kn, cur);

        if (cmp == 0)
            return -EEXIST;

        above = *link;
        link = (cmp < 0) ? &cur->rb.rb_left : &cur->rb.rb_right;
    }

    /* add new node and rebalance the tree */
    rb_link_node(&kn->rb, above, link);
    rb_insert_color(&kn->rb, &kn->parent->dir.children);

    /* successfully added, account subdir number */
    if (kernfs_type(kn) == KERNFS_DIR)
        kn->parent->dir.subdirs++;

    return 0;
}

在sysfs中创建文件

/*
 * sysfs_create_file - create an attribute file for @kobj without a namespace.
 * Forwards to sysfs_create_file_ns() with a NULL namespace tag and returns
 * its result.
 */
static inline int __must_check sysfs_create_file(struct kobject *kobj,
                         const struct attribute *attr)
{
    return sysfs_create_file_ns(kobj, attr, NULL);
}

/**
 * sysfs_create_file_ns - create an attribute file for an object with custom ns
 * @kobj: object we're creating for
 * @attr: attribute descriptor
 * @ns: namespace the new file should belong to
 */
int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
             const void *ns)
{
    BUG_ON(!kobj || !kobj->sd || !attr);

    return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns);

}
EXPORT_SYMBOL_GPL(sysfs_create_file_ns);

/*
 * sysfs_add_file_mode_ns - create the kernfs file backing a sysfs attribute.
 * @parent: kernfs directory the file is created in
 * @attr:   attribute descriptor (a struct bin_attribute when @is_bin)
 * @is_bin: true for binary attributes
 * @mode:   file mode; SYSFS_PREALLOC selects the preallocating ops
 * @ns:     namespace tag of the new file
 *
 * Picks the kernfs_ops table that matches the callbacks the attribute
 * supports, then creates the file.  Returns 0 on success, -EINVAL when a
 * regular attribute's kobject has no sysfs_ops, or the error reported by
 * __kernfs_create_file() (a duplicate name is additionally warned about).
 */
int sysfs_add_file_mode_ns(struct kernfs_node *parent,
               const struct attribute *attr, bool is_bin,
               umode_t mode, const void *ns)
{
    struct lock_class_key *key = NULL;
    const struct kernfs_ops *ops;
    struct kernfs_node *kn;
    loff_t size;

    if (is_bin) {
        struct bin_attribute *battr = (void *)attr;

        /* binary attribute: mmap wins, otherwise pick by read/write */
        if (battr->mmap)
            ops = &sysfs_bin_kfops_mmap;
        else if (battr->read)
            ops = battr->write ? &sysfs_bin_kfops_rw
                       : &sysfs_bin_kfops_ro;
        else if (battr->write)
            ops = &sysfs_bin_kfops_wo;
        else
            ops = &sysfs_file_kfops_empty;

        size = battr->size;
    } else {
        struct kobject *kobj = parent->priv;
        const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
        bool prealloc = mode & SYSFS_PREALLOC;

        /* every kobject with an attribute needs a ktype assigned */
        if (WARN(!sysfs_ops, KERN_ERR
             "missing sysfs attribute operations for kobject: %s\n",
             kobject_name(kobj)))
            return -EINVAL;

        /* regular attribute: pick by show/store, then by preallocation */
        if (sysfs_ops->show && sysfs_ops->store)
            ops = prealloc ? &sysfs_prealloc_kfops_rw
                       : &sysfs_file_kfops_rw;
        else if (sysfs_ops->show)
            ops = prealloc ? &sysfs_prealloc_kfops_ro
                       : &sysfs_file_kfops_ro;
        else if (sysfs_ops->store)
            ops = prealloc ? &sysfs_prealloc_kfops_wo
                       : &sysfs_file_kfops_wo;
        else
            ops = &sysfs_file_kfops_empty;

        size = PAGE_SIZE;
    }

#ifdef CONFIG_DEBUG_LOCK_ALLOC
    if (!attr->ignore_lockdep)
        key = attr->key ?: (struct lock_class_key *)&attr->skey;
#endif
    /* create the attribute file itself */
    kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops,
                  (void *)attr, ns, key);
    if (!IS_ERR(kn))
        return 0;

    if (PTR_ERR(kn) == -EEXIST)
        sysfs_warn_dup(parent, attr->name);
    return PTR_ERR(kn);
}

通过上面的代码跟踪，创建属性文件由__kernfs_create_file实现，最终仍然是调用kernfs_new_node和kernfs_add_one。

/**
 * __kernfs_create_file - kernfs internal function to create a file
 * @parent: directory to create the file in
 * @name: name of the file
 * @mode: mode of the file
 * @size: size of the file
 * @ops: kernfs operations for the file
 * @priv: private data for the file
 * @ns: optional namespace tag of the file
 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
 *
 * Allocates a KERNFS_FILE node, fills in its attributes and links it into
 * @parent.  If linking fails, the reference on the new node is dropped.
 *
 * Returns the created node on success, ERR_PTR() value on error.
 */
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
                     const char *name,
                     umode_t mode, loff_t size,
                     const struct kernfs_ops *ops,
                     void *priv, const void *ns,
                     struct lock_class_key *key)
{
    struct kernfs_node *kn;
    int err;

    /* a regular file of kernfs type "file" */
    kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG,
                 KERNFS_FILE);
    if (!kn)
        return ERR_PTR(-ENOMEM);

    kn->priv = priv;
    kn->ns = ns;
    kn->attr.size = size;
    kn->attr.ops = ops;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
    if (key) {
        lockdep_init_map(&kn->dep_map, "s_active", key, 0);
        kn->flags |= KERNFS_LOCKDEP;
    }
#endif

    /*
     * kn->attr.ops is accessible only while holding active ref.  We
     * need to know whether some ops are implemented outside active
     * ref.  Cache their existence in flags.
     */
    if (ops->seq_show)
        kn->flags |= KERNFS_HAS_SEQ_SHOW;
    if (ops->mmap)
        kn->flags |= KERNFS_HAS_MMAP;

    /* link the node into the parent's rbtree */
    err = kernfs_add_one(kn);
    if (!err)
        return kn;

    kernfs_put(kn);
    return ERR_PTR(err);
}

在sysfs_add_file_mode_ns函数中，根据kobject的sysfs_ops是否实现了show/store，以及mode中是否带有SYSFS_PREALLOC标志，选择不同的kernfs_ops读写回调表。下面以sysfs_prealloc_kfops_rw结构为例，其他结构类似，不赘述。

//kernfs_ops table for a regular sysfs file: readable and writable, with .prealloc set
static const struct kernfs_ops sysfs_prealloc_kfops_rw = {
    .read       = sysfs_kf_read,
    .write      = sysfs_kf_write,
    .prealloc   = true,
};

/*
 * kernfs read callback for regular sysfs files with pre-alloc.
 *
 * Only a read at offset 0 into the preallocated buffer is served; anything
 * else yields 0.  The data comes from the ->show() method of the owning
 * kobject's sysfs_ops and the result is capped at @count.
 */
static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
                 size_t count, loff_t pos)
{
    /* sysfs_ops table of the kobject that owns this file */
    const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
    struct kobject *owner = of->kn->parent->priv;
    size_t shown;

    if (pos)
        return 0;

    /*
     * If buf != of->prealloc_buf, we don't know how
     * large it is, so cannot safely pass it to ->show
     */
    if (WARN_ON_ONCE(buf != of->prealloc_buf))
        return 0;

    shown = ops->show(owner, of->kn->priv, buf);
    return min(count, shown);
}

/*
 * kernfs write callback for regular sysfs files.
 *
 * An empty write returns 0 without calling into the attribute; otherwise
 * the data is handed to the ->store() method of the owning kobject's
 * sysfs_ops and its result is returned.
 */
static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
                  size_t count, loff_t pos)
{
    /* sysfs_ops table of the kobject that owns this file */
    const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
    struct kobject *owner = of->kn->parent->priv;

    return count ? ops->store(owner, of->kn->priv, buf, count) : 0;
}

关于属性文件的读写操作，最终都回调到kobject的ktype域中的sysfs_ops操作表，这个操作表是在kobject_init函数中（通过传入的ktype参数）设置的。回顾kobject_create函数：

/*
 * kobject_create - allocate a kobject and initialise it as a dynamic one.
 *
 * The kobject is zero-allocated and initialised with dynamic_kobj_ktype.
 * Returns the new kobject, or NULL if the allocation failed.
 */
struct kobject *kobject_create(void)
{
    struct kobject *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

    if (obj)
        kobject_init(obj, &dynamic_kobj_ktype);

    return obj;
}

//kobj_type handed to kobject_init() by kobject_create(): release callback plus the kobj_sysfs_ops table
static struct kobj_type dynamic_kobj_ktype = {
    .release    = dynamic_kobj_release,
    .sysfs_ops  = &kobj_sysfs_ops,
};

/* sysfs_ops table whose show/store entries are kobj_attr_show and kobj_attr_store */
const struct sysfs_ops kobj_sysfs_ops = {
    .show   = kobj_attr_show,
    .store  = kobj_attr_store,
};
EXPORT_SYMBOL_GPL(kobj_sysfs_ops);

kobject的sysfs的show和store方法为：kobj_attr_show和kobj_attr_store

/*
 * sysfs ->show() for kobj_attribute based attributes: recover the
 * enclosing kobj_attribute from @attr and delegate to its show() method.
 * Returns -EIO when the attribute provides no show() method.
 */
static ssize_t kobj_attr_show(struct kobject *kobj, struct attribute *attr,
                  char *buf)
{
    struct kobj_attribute *kattr =
        container_of(attr, struct kobj_attribute, attr);

    if (!kattr->show)
        return -EIO;

    /* implemented by the subsystem that owns the attribute */
    return kattr->show(kobj, kattr, buf);
}

/*
 * sysfs ->store() for kobj_attribute based attributes: recover the
 * enclosing kobj_attribute from @attr and delegate to its store() method.
 * Returns -EIO when the attribute provides no store() method.
 */
static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr,
                   const char *buf, size_t count)
{
    struct kobj_attribute *kattr =
        container_of(attr, struct kobj_attribute, attr);

    if (!kattr->store)
        return -EIO;

    /* implemented by the subsystem that owns the attribute */
    return kattr->store(kobj, kattr, buf, count);
}

真正的对属性文件进行读写的回调由业务子系统实现。

在sysfs中创建符号链接

/**
 *  sysfs_create_link - create symlink between two objects.
 *  @kobj:  object whose directory we're creating the link in.
 *  @target:    object we're pointing to.
 *  @name:      name of the symlink.
 *
 *  Forwards to sysfs_do_create_link() with the warn argument set to 1 and
 *  returns its result.
 */
int sysfs_create_link(struct kobject *kobj, struct kobject *target,
              const char *name)
{
    return sysfs_do_create_link(kobj, target, name, 1);
}
EXPORT_SYMBOL_GPL(sysfs_create_link);

/*
 * Resolve the directory the symlink goes into and create it there.
 * A NULL @kobj selects the sysfs root; otherwise the kobject's own sysfs
 * directory is used.  Returns -EFAULT when no directory is available,
 * else the result of sysfs_do_create_link_sd().
 */
static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
                const char *name, int warn)
{
    struct kernfs_node *dir = kobj ? kobj->sd : sysfs_root_kn;

    if (dir == NULL)
        return -EFAULT;

    return sysfs_do_create_link_sd(dir, target, name, warn);
}

/*
 * Create a symlink called @name in @parent that points at the sysfs
 * directory of @target_kobj.
 *
 * Returns 0 on success, -ENOENT if the target has no sysfs directory, or
 * the error from kernfs_create_link().  With @warn set, a name collision
 * (-EEXIST) is reported through sysfs_warn_dup().
 */
static int sysfs_do_create_link_sd(struct kernfs_node *parent,
                   struct kobject *target_kobj,
                   const char *name, int warn)
{
    struct kernfs_node *link, *dest;

    BUG_ON(!name || !parent);

    /*
     * We don't own @target_kobj and it may be removed at any time.
     * Synchronize using sysfs_symlink_target_lock.  See
     * sysfs_remove_dir() for details.
     */
    spin_lock(&sysfs_symlink_target_lock);
    dest = target_kobj->sd;
    if (dest)
        kernfs_get(dest);
    spin_unlock(&sysfs_symlink_target_lock);

    if (!dest)
        return -ENOENT;

    /* create the symlink, then drop the temporary reference taken above */
    link = kernfs_create_link(parent, name, dest);
    kernfs_put(dest);

    if (IS_ERR(link)) {
        if (warn && PTR_ERR(link) == -EEXIST)
            sysfs_warn_dup(parent, name);
        return PTR_ERR(link);
    }

    return 0;
}

由上面的代码追踪，创建符号链接最终由kernfs_create_link函数完成。

/**
 * kernfs_create_link - create a symlink
 * @parent: directory to create the symlink in
 * @name: name of the symlink
 * @target: target node for the symlink to point to
 *
 * The new node takes its own reference on @target.  When @parent has
 * namespaces enabled, the link inherits the namespace tag of @target.
 *
 * Returns the created node on success, ERR_PTR() value on error.
 */
struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
                       const char *name,
                       struct kernfs_node *target)
{
    struct kernfs_node *link;
    int rc;

    /* a node of kernfs type "link" */
    link = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);
    if (!link)
        return ERR_PTR(-ENOMEM);

    if (kernfs_ns_enabled(parent))
        link->ns = target->ns;
    link->symlink.target_kn = target;
    kernfs_get(target); /* ref owned by symlink */

    /* link the node into the parent's rbtree */
    rc = kernfs_add_one(link);
    if (rc) {
        kernfs_put(link);
        return ERR_PTR(rc);
    }

    return link;
}

与创建目录和文件类似，最终仍然是调用kernfs_new_node和kernfs_add_one实现。

基于内核对象编程套路

目标：在sysfs中创建一个目录/sys/kernel/storage/，在该目录下，还创建了一个文件value。value可以写入整型数据，随后可以读出。
* 定义内核对象

/* Example kernel object: an embedded kobject plus the integer exposed through the "value" file. */
struct storage_obj {
    struct kobject kobj;
    int val;  //holds the data that was written
};

定义属性类型

struct storage_attribute {
    struct attribute *attr;
    ssize_t (*show)(struct kobject *, struct attribute *, char *);
    ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t);
}

声明属性
定义属性的show和store方法，如下：

//定义并初始化storage_attribute
struct storage_attribute *sattr = &struct storage_attribute {
    .attr = {.name = "value", .mode = 0666},
    .show = storage_show,
    .store = storage_store,
};

实现sysfs操作

ssize_t storage_show(struct kobject *kobj, struct attribute *attr, char *buf) 
{
    struct storage *stor = container_of(kobj, struct storage_obj, kobj);
    stor->val = atoi(buf);
}

/*
 * storage_store - write handler of the "value" attribute.
 * @kobj: kobject embedded in the storage_obj being written
 * @attr: attribute being written (unused)
 * @buf:  text written by the user
 * @s:    number of bytes in @buf
 *
 * Parses @buf as a decimal integer and stores it in the object.
 * Returns @s when the value was accepted, or the negative error code from
 * kstrtoint() when @buf is not a valid integer; in that case the stored
 * value is left untouched.
 */
ssize_t storage_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t s)
{
    struct storage_obj *stor = container_of(kobj, struct storage_obj, kobj);
    int val;
    int err;

    err = kstrtoint(buf, 10, &val);
    if (err)
        return err;

    stor->val = val;
    return s;
}

定义内核对象release方法
release方法设置在kobj_type结构中

void storage_release(struct kobject *kobj)
{
    ......
}

声明内核对象类型

struct storage_ktype {
    struct kobj_type *ktype;
}

封装对象属性添加和删除方法
需要将value属性添加到内核对象，或者从内核对象删除，可以直接调用sysfs_create_file和sysfs_remove_file。但大多数情况下，会对这两个方法做一层封装：storage_create_file和storage_remove_file。

/*
 * storage_create_file - add an attribute file to a storage object.
 *
 * Thin wrapper around sysfs_create_file().  A NULL @sobj is not an error:
 * nothing is created and 0 is returned.
 */
int storage_create_file(struct storage_obj *sobj, const struct storage_attribute *attr)
{
    if (!sobj)
        return 0;

    return sysfs_create_file(&sobj->kobj, &attr->attr);
}

/*
 * storage_remove_file - remove an attribute file from a storage object.
 *
 * Thin wrapper around sysfs_remove_file(); does nothing for a NULL @sobj.
 */
void storage_remove_file(struct storage_obj *sobj, const struct storage_attribute *attr)
{
    if (!sobj)
        return;

    sysfs_remove_file(&sobj->kobj, &attr->attr);
}

定义对象的创建和销毁方法

struct storage_obj * create_storage_obj() 
{
    struct storage_obj *sobj = (struct storage_obj *)malloc(struct storage_obj);
    struct storage_ktype *stype = (struct storage_ktype *)malloc(struct storage_ktype);
    sobj->parent = kernel_kobj;
    kobject_init_and_add(&sobj->kobj, &stype->ktype);

    return sobj
}

/*
 * destroy_storage_obj - unregister a storage object and give up its reference.
 * @kobj: kobject embedded in the storage_obj to destroy (may be NULL)
 *
 * The memory is not freed here: dropping the last reference invokes the
 * release callback of the object's kobj_type, which is expected to free
 * the enclosing storage_obj (presumably storage_release -- confirm).
 */
void destroy_storage_obj(struct kobject *kobj)
{
    if (!kobj)
        return;

    kobject_del(kobj);  /* remove the directory from sysfs */
    kobject_put(kobj);  /* drop the reference held since creation */
}

实现模块加载和卸载方法
加载时调用create_storage_obj，卸载时调用destroy_storage_obj

设备驱动模型

概述

Linux的设备驱动模型能够带来以下的优点：
* 使用统一机制来表达设备与驱动之间的关系，规范设备驱动的编写，核心代码复用。
* 将系统中的设备以树结构组织，并且通过sysfs将其呈现在用户空间——包括所有的总线和内部连接。
* 支持设备的热插拔机制。
* 支持通用的电源管理机制，通过由叶子节点到根节点的方向遍历设备树，确保子设备在父设备之前断电。

内核基于内核对象和sysfs，通过抽象以下五种概念，实现了设备驱动模型的框架，使得编写子系统成为“八股文”。
1. bus_type: 总线类型，每个子系统有且只有一个总线类型，由bus_type和subsys_private两个结构共同描述。
2. device: 设备，描述挂在总线类型中的设备，由device和device_private两个结构共同描述。
3. driver: 驱动，描述挂在总线类型中的驱动模块，由device_driver和driver_private两个结构共同描述。
4. class: 类，每个总线类型有且只有一个类，由class和subsys_private两个结构共同描述。
5. class_interface: 接口，每个类有多个接口，由class_interface结构描述。

在Linux内核中，子系统是由bus_type, device, driver, class和class_interface之间的关系所描述，而设备驱动模型正是这些关系的核心实现，使得在编写子系统程序时，只要遵循设备模型的套路，便不需要关注于这些复杂的关系，只需实现自身的业务逻辑。

每个子系统都有一个总线类型，总线类型拥有一个设备链表和一个驱动链表，用于连接由该总线类型已发现的设备和已加载的驱动，设备发现和驱动加载的顺序是任意的。每个设备最多绑定到一个驱动，被绑定了驱动的设备可以正常工作。除此之外，每个设备可以唯一属于某个类，类中包含多个接口，接口的方法作用于设备，不管是先添加接口，还是先发现设备。

总线类型

总线类型的数据结构

/* Bus type: the part of a subsystem description that the subsystem itself fills in. */
struct bus_type {
    const char      *name;         //name of the subsystem
    const char      *dev_name;     //used by the subsystem to generate device names
    struct device       *dev_root;
    struct device_attribute *dev_attrs; /* use dev_groups instead */
    const struct attribute_group **bus_groups;  //attribute groups of the bus type
    const struct attribute_group **dev_groups;  //attribute groups of the devices
    const struct attribute_group **drv_groups;  //attribute groups of the drivers

    int (*match)(struct device *dev, struct device_driver *drv);    //checks whether a device and a driver can be bound
    int (*uevent)(struct device *dev, struct kobj_uevent_env *env); //sets bus-specific environment variables before an event is sent
    int (*probe)(struct device *dev);     //when a device can be bound to a driver: initialize the device and perform the binding
    int (*remove)(struct device *dev);    //called when a device is unbound from its driver
    void (*shutdown)(struct device *dev); //called when the device is powered off

    int (*online)(struct device *dev);    //called when the device is brought online
    int (*offline)(struct device *dev);   //called when the device is taken offline

    int (*suspend)(struct device *dev, pm_message_t state); //called when the device enters a power-saving state
    int (*resume)(struct device *dev);                      //called when the device returns to its normal state

    const struct dev_pm_ops *pm;  //power management operations

    const struct iommu_ops *iommu_ops;

    struct subsys_private *p;         //private data of the subsystem
    struct lock_class_key lock_key;
};

/* Private part of a bus type / class: the relationships maintained by the driver core. */
struct subsys_private {
    struct kset subsys;          //kset of the bus; /sys/bus/scsi for the scsi subsystem
    struct kset *devices_kset;   //kset of the devices; /sys/bus/scsi/devices for the scsi subsystem
    struct list_head interfaces; //list of interfaces of the bus
    struct mutex mutex;          

    struct kset *drivers_kset;   //kset of the drivers; /sys/bus/scsi/drivers for the scsi subsystem
    struct klist klist_devices;  //list of devices on the bus
    struct klist klist_drivers;  //list of drivers on the bus
    struct blocking_notifier_head bus_notifier; //notifier chain called when the subsystem changes
    unsigned int drivers_autoprobe:1;  //whether to probe automatically when a device or driver is added
    struct bus_type *bus;        //points back to the bus type

    struct kset glue_dirs;
    struct class *class;         //points to the class of the bus type
};

从上面的两个结构可以看到，bus_type包含的主要是实现子系统应该具体关注的比如name，一组回调函数。而subsys_private结构主要是设备驱动模型中的关系的表达，如字段subsys的类型是kset，描述该子系统在sysfs中的表达；klist_devices和klist_drivers分别是设备链表和驱动链表，用于管理总线类型的所有设备和驱动。之后仍然会遇到xxx_private的结构，以这种方式命名的结构，都是给设备驱动模型核心使用的，业务子系统无需也不能使用。

总线类型注册/反注册

实现子系统的第一步就是创建bus_type，并将其注册到系统，此时需要调用bus_register:

/**
 * bus_register - register a driver-core subsystem
 * @bus: bus to register
 *
 * Once we have that, we register the bus with the kobject
 * infrastructure, then register the children subsystems it has:
 * the devices and drivers that belong to the subsystem.
 *
 * Returns 0 on success or a negative errno.  On failure the steps that
 * already succeeded are undone in reverse order and @bus->p is reset to
 * NULL.
 */
int bus_register(struct bus_type *bus)
{
    int retval;
    struct subsys_private *priv;
    struct lock_class_key *key = &bus->lock_key;
    //allocate the private data of the bus type
    priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL);
    if (!priv)
        return -ENOMEM;

    priv->bus = bus; //link bus_type and subsys_private to each other
    bus->p = priv;

    BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier);
    //store the bus type name in the kobject; it is the name shown in sysfs
    retval = kobject_set_name(&priv->subsys.kobj, "%s", bus->name);
    if (retval)
        goto out;

    priv->subsys.kobj.kset = bus_kset;
    priv->subsys.kobj.ktype = &bus_ktype;
    priv->drivers_autoprobe = 1;    //enable automatic probing

    retval = kset_register(&priv->subsys);  //add the bus type to the device model
    if (retval)
        goto out;

    retval = bus_create_file(bus, &bus_attr_uevent); //create the uevent attribute file
    if (retval)
        goto bus_uevent_fail;

    priv->devices_kset = kset_create_and_add("devices", NULL,  //create the devices directory
                         &priv->subsys.kobj);
    if (!priv->devices_kset) {
        retval = -ENOMEM;
        goto bus_devices_fail;
    }

    priv->drivers_kset = kset_create_and_add("drivers", NULL,  //create the drivers directory
                         &priv->subsys.kobj);
    if (!priv->drivers_kset) {
        retval = -ENOMEM;
        goto bus_drivers_fail;
    }
    //initialize the lists and the lock
    INIT_LIST_HEAD(&priv->interfaces);
    __mutex_init(&priv->mutex, "subsys mutex", key);
    klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put);
    klist_init(&priv->klist_drivers, NULL, NULL);

    retval = add_probe_files(bus); //add the probe files drivers_autoprobe and drivers_probe to sysfs
    if (retval)
        goto bus_probe_files_fail;

    retval = bus_add_groups(bus, bus->bus_groups); //add the attribute files of the bus type
    if (retval)
        goto bus_groups_fail;

    pr_debug("bus: '%s': registered\n", bus->name);
    return 0;
    //error unwinding: each label undoes one earlier step, in reverse order
bus_groups_fail:
    remove_probe_files(bus);
bus_probe_files_fail:
    kset_unregister(bus->p->drivers_kset);
bus_drivers_fail:
    kset_unregister(bus->p->devices_kset);
bus_devices_fail:
    bus_remove_file(bus, &bus_attr_uevent);
bus_uevent_fail:
    kset_unregister(&bus->p->subsys);
out:
    kfree(bus->p);
    bus->p = NULL;
    return retval;
}
EXPORT_SYMBOL_GPL(bus_register);

注册总线类型后，便可以在系统看到：

root@ubuntu16:~# ls /sys/bus/scsi -l
total 0
drwxr-xr-x 2 root root    0 Sep  5 16:01 devices
drwxr-xr-x 4 root root    0 Sep  2 09:44 drivers
-rw-r--r-- 1 root root 4096 Sep  5 11:29 drivers_autoprobe
--w------- 1 root root 4096 Sep  5 11:29 drivers_probe
--w------- 1 root root 4096 Sep  2 09:44 uevent
root@ubuntu16:~#

当从系统中注销子系统时，需要调用bus_unregister，完成总线类型的反注册：

/**
 * bus_unregister - remove a bus from the system
 * @bus: bus.
 *
 * Unregister the child subsystems and the bus itself.
 * Finally, we call bus_put() to release the refcount
 */
void bus_unregister(struct bus_type *bus)
{
    pr_debug("bus: '%s': unregistering\n", bus->name);
    if (bus->dev_root)
        device_unregister(bus->dev_root);     //remove the root device
    bus_remove_groups(bus, bus->bus_groups);  //remove the attribute files of the bus
    remove_probe_files(bus);                  //remove the probe files drivers_autoprobe and drivers_probe
    kset_unregister(bus->p->drivers_kset);    //remove the drivers directory
    kset_unregister(bus->p->devices_kset);    //remove the devices directory
    bus_remove_file(bus, &bus_attr_uevent);   //remove the uevent file
    kset_unregister(&bus->p->subsys);         //remove the bus directory
}
EXPORT_SYMBOL_GPL(bus_unregister);

设备

设备的数据结构

/* A device in the driver model; embeds a kobject and points to its bus, driver and class. */
struct device {
    struct device       *parent;  //points to the parent device, e.g. an HBA

    struct device_private   *p;   //private data of the device

    struct kobject kobj;          //embedded kobject
    const char      *init_name; /* initial name of the device */
    const struct device_type *type;  //device type: the fields and methods factored out for it

    struct mutex        mutex;  /* mutex to synchronize calls to its driver */

    struct bus_type *bus;       /* type of bus device is on, i.e. the bus this device belongs to */
    struct device_driver *driver;   /* which driver has allocated this device */
    void        *platform_data; /* Platform specific data, device core doesn't touch it */
    void        *driver_data;   /* Driver data, set and get with dev_set/get_drvdata */
    struct dev_pm_info  power;
    struct dev_pm_domain    *pm_domain;

#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
    struct irq_domain   *msi_domain;
#endif
#ifdef CONFIG_PINCTRL
    struct dev_pin_info *pins;
#endif
#ifdef CONFIG_GENERIC_MSI_IRQ
    struct list_head    msi_list;
#endif

#ifdef CONFIG_NUMA
    int     numa_node;  /* NUMA node this device is close to */
#endif
    u64     *dma_mask;  /* dma mask (if dma'able device) */
    u64     coherent_dma_mask;/* Like dma_mask, but for
                         alloc_coherent mappings as
                         not all hardware supports
                         64 bit addresses for consistent
                         allocations such descriptors. */
    unsigned long   dma_pfn_offset;

    struct device_dma_parameters *dma_parms;

    struct list_head    dma_pools;  /* dma pools (if dma'ble) */

    struct dma_coherent_mem *dma_mem; /* internal for coherent mem override */
#ifdef CONFIG_DMA_CMA
    struct cma *cma_area;       /* contiguous memory area for dma allocations */
#endif
    /* arch specific additions */
    struct dev_archdata archdata;

    struct device_node  *of_node; /* associated device tree node */
    struct fwnode_handle    *fwnode; /* firmware device node */

    dev_t           devt;   /* dev_t, creates the sysfs "dev"; the device number */
    u32         id; /* device instance */

    spinlock_t      devres_lock;
    struct list_head    devres_head; //head of the device resource list

    struct klist_node   knode_class; //node linking the device into its class's device list
    struct class        *class;      //class the device is linked into
    const struct attribute_group **groups;  /* optional groups; attributes specific to this device */

    void    (*release)(struct device *dev);  //callback invoked when the device is released
    struct iommu_group  *iommu_group;

    bool            offline_disabled:1;
    bool            offline:1;
};

/* Private part of a device: the list linkage maintained by the driver core. */
struct device_private {
    struct klist klist_children;     //list of child devices
    struct klist_node knode_parent;  //node in the parent device's children list
    struct klist_node knode_driver;  //node in the driver's device list
    struct klist_node knode_bus;     //node in the bus's device list
    struct list_head deferred_probe; //node in the deferred-probe list
    struct device *device;           //points to the associated device
};

/* Attributes and methods shared by all devices of one type. */
struct device_type {
    const char *name;  //name of the device type
    const struct attribute_group **groups;  //attribute groups common to the devices
    int (*uevent)(struct device *dev, struct kobj_uevent_env *env); //called before an event is sent, to set the event's environment variables
    char *(*devnode)(struct device *dev, umode_t *mode, //provides a naming hint when the device is created
             kuid_t *uid, kgid_t *gid);
    void (*release)(struct device *dev);    //called when the device is released

    const struct dev_pm_ops *pm;
};

在设备驱动模型中，device结构有bus域，指向device所属的总线类型；class域指向device所属的唯一的类；driver域指向设备所绑定的驱动。与内核对象一样，设备也被组织成层次结构，通过parent指向父设备。

device_private结构由设备驱动模型处理，维护和其他结构之间的内部关系。device_type结构定义设备公有的属性和方法。

设备的注册与反注册

当设备被发现后，需要将设备注册到系统，需要调用device_register函数：

/**
 * device_register - register a device with the system.
 * @dev: pointer to the device structure
 *
 * This happens in two clean steps - initialize the device
 * and add it to the system. The two steps can be called
 * separately, but this is the easiest and most common.
 * I.e. you should only call the two helpers separately if
 * have a clearly defined need to use and refcount the device
 * before it is added to the hierarchy.
 *
 * For more information, see the kerneldoc for device_initialize()
 * and device_add().
 *
 * NOTE: _Never_ directly free @dev after calling this function, even
 * if it returned an error! Always use put_device() to give up the
 * reference initialized in this function instead.
 *
 * Returns the result of device_add().
 */
int device_register(struct device *dev)
{
    device_initialize(dev);  //initialize the device structure
    return device_add(dev);  //add the device to the system
}
EXPORT_SYMBOL_GPL(device_register);


/*
 * device_initialize - prepare a device structure for use.
 * @dev: device to initialize
 *
 * Sets the kset of the embedded kobject to devices_kset, initializes the
 * kobject with device_ktype, and initializes the lists, locks and power
 * management state of @dev.  The device is not added to the system here.
 */
void device_initialize(struct device *dev)
{
    dev->kobj.kset = devices_kset;             // /sys/devices/
    kobject_init(&dev->kobj, &device_ktype);   // the kobject type of a device is device_ktype
    INIT_LIST_HEAD(&dev->dma_pools);
    mutex_init(&dev->mutex);
    lockdep_set_novalidate_class(&dev->mutex);
    spin_lock_init(&dev->devres_lock);
    INIT_LIST_HEAD(&dev->devres_head);
    device_pm_init(dev);
    set_dev_node(dev, -1);
#ifdef CONFIG_GENERIC_MSI_IRQ
    INIT_LIST_HEAD(&dev->msi_list);
#endif
}
EXPORT_SYMBOL_GPL(device_initialize);

device_register函数调用device_initialize对device结构进行初始化，调用device_add函数完成设备添加到系统。

/*
 * device_add - add an initialized device to the system.
 * @dev: device to add
 *
 * Takes a temporary reference on @dev, makes sure it has private data and
 * a name, adds its kobject under the parent, creates its sysfs files and
 * symlinks, adds it to its bus and to power management, notifies the bus
 * notifier chain, sends the ADD uevent, tries to bind a driver, and links
 * the device into its parent's children list and its class's device list.
 *
 * Returns 0 on success or a negative errno.  On failure the steps already
 * performed are undone in reverse order through the labels at the end of
 * the function.  The temporary reference is dropped on every path.
 */
int device_add(struct device *dev)
{
    struct device *parent = NULL;
    struct kobject *kobj;
    struct class_interface *class_intf;
    int error = -EINVAL;

    dev = get_device(dev);
    if (!dev)
        goto done;

    if (!dev->p) {  //no device_private attached yet: set one up here
        error = device_private_init(dev);
        if (error)
            goto done;
    }

    /*
     * for statically allocated devices, which should all be converted
     * some day, we need to initialize the name. We prevent reading back
     * the name, and force the use of dev_name()
     */
    if (dev->init_name) {
        dev_set_name(dev, "%s", dev->init_name); //set the name of the device's kobject
        dev->init_name = NULL;
    }

    /* subsystems can specify simple device enumeration */
    if (!dev_name(dev) && dev->bus && dev->bus->dev_name) //still no name: build one from the bus's dev_name and the device id
        dev_set_name(dev, "%s%u", dev->bus->dev_name, dev->id);

    if (!dev_name(dev)) {
        error = -EINVAL;
        goto name_error;
    }

    pr_debug("device: '%s': %s\n", dev_name(dev), __func__);

    parent = get_device(dev->parent);
    kobj = get_device_parent(dev, parent);
    if (kobj)
        dev->kobj.parent = kobj;  //set the parent field of the device's kobject

    /* use parent numa_node */
    if (parent && (dev_to_node(dev) == NUMA_NO_NODE))
        set_dev_node(dev, dev_to_node(parent));

    /* first, register with generic layer. */
    /* we require the name to be set before, and pass NULL */
    error = kobject_add(&dev->kobj, dev->kobj.parent, NULL); //add the device under its parent's directory
    if (error)
        goto Error;

    /* notify platform of device entry */
    if (platform_notify)
        platform_notify(dev);

    error = device_create_file(dev, &dev_attr_uevent); //create the uevent file in the device directory
    if (error)
        goto attrError;

    error = device_add_class_symlinks(dev); //create the class-related symlinks of the device
    if (error)
        goto SymlinkError;
    error = device_add_attrs(dev); //create the files for the device's default attributes
    if (error)
        goto AttrsError;
    error = bus_add_device(dev);  //add the device to its bus_type
    if (error)
        goto BusError;
    error = dpm_sysfs_add(dev);
    if (error)
        goto DPMError;
    device_pm_add(dev);

    if (MAJOR(dev->devt)) {
        error = device_create_file(dev, &dev_attr_dev); //create the "dev" attribute file, which holds the device number
        if (error)
            goto DevAttrError;

        error = device_create_sys_dev_entry(dev); //create a symlink to the device directory (presumably under /sys/dev/block or /sys/dev/char -- verify)
        if (error)
            goto SysEntryError;

        devtmpfs_create_node(dev); //create the device file under /dev
    }

    /* Notify clients of device addition.  This call must come
     * after dpm_sysfs_add() and before kobject_uevent().
     */
    if (dev->bus)
        blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
                         BUS_NOTIFY_ADD_DEVICE, dev);

    kobject_uevent(&dev->kobj, KOBJ_ADD); //send the ADD event for the device
    bus_probe_device(dev);  //try to bind the device to a device_driver
    if (parent)  //a parent was given: add the device to the parent's children list
        klist_add_tail(&dev->p->knode_parent,
                   &parent->p->klist_children);

    if (dev->class) {  //a class is set: add the device to the class's device list
        mutex_lock(&dev->class->p->mutex);
        /* tie the class to the device */
        klist_add_tail(&dev->knode_class,
                   &dev->class->p->klist_devices);

        /* notify any interfaces that the device is here */
        list_for_each_entry(class_intf,  //call add_dev of every class_interface of the device's class
                    &dev->class->p->interfaces, node)
            if (class_intf->add_dev)
                class_intf->add_dev(dev, class_intf);
        mutex_unlock(&dev->class->p->mutex);
    }
done:
    put_device(dev);
    return error;
 SysEntryError:
    if (MAJOR(dev->devt))
        device_remove_file(dev, &dev_attr_dev);
 DevAttrError:
    device_pm_remove(dev);
    dpm_sysfs_remove(dev);
 DPMError:
    bus_remove_device(dev);
 BusError:
    device_remove_attrs(dev);
 AttrsError:
    device_remove_class_symlinks(dev);
 SymlinkError:
    device_remove_file(dev, &dev_attr_uevent);
 attrError:
    kobject_uevent(&dev->kobj, KOBJ_REMOVE);
    kobject_del(&dev->kobj);
 Error:
    cleanup_device_parent(dev);
    put_device(parent);
name_error:
    kfree(dev->p);
    dev->p = NULL;
    goto done;
}
EXPORT_SYMBOL_GPL(device_add);

设备添加到系统主要流程都在device_add函数实现，上面代码的注释基本把主要函数的作用进行了描述。值得关注的一个函数便是bus_probe_device，该函数完成将设备绑定到驱动的动作。

void bus_probe_device(struct device *dev)
{
    struct bus_type *bus = dev->bus;
    struct subsys_interface *sif;

    if (!bus)
        return;

    if (bus->p->drivers_autoprobe) //如果bus允许自动探测
        device_initial_probe(dev); //主要功能

    mutex_lock(&bus->p->mutex);
    list_for_each_entry(sif, &bus->p->interfaces, node) //将设备绑定到接口
        if (sif->add_dev)
            sif->add_dev(dev, sif);
    mutex_unlock(&bus->p->mutex);
}

void device_initial_probe(struct device *dev)
{
    __device_attach(dev, true);
}

/*
 * __device_attach - try to attach a device to a driver.
 * @dev: device to attach.
 * @allow_async: whether drivers that want asynchronous probing may be
 *               deferred to an asynchronously scheduled helper.
 *
 * Runs with the device lock held.  If dev->driver is already set, the
 * device is bound to that driver (or left alone if it is already bound).
 * Otherwise every driver on the device's bus is offered the device via
 * __device_attach_driver().
 *
 * Returns 1 if dev->driver was set and the device is (now) bound, 0 if
 * binding to the preset driver failed, and otherwise whatever
 * bus_for_each_drv() returned.
 */
static int __device_attach(struct device *dev, bool allow_async)
{
    int ret = 0;

    device_lock(dev);
    if (dev->driver) {  // a driver has already been chosen for this device
        if (klist_node_attached(&dev->p->knode_driver)) { // knode_driver is already on a klist: nothing to do
            ret = 1;
            goto out_unlock;
        }
        ret = device_bind_driver(dev); // bind the device to the preset driver
        if (ret == 0)
            ret = 1;
        else {
            /* binding failed: forget the preset driver and report "not attached" */
            dev->driver = NULL;
            ret = 0;
        }
    } else {  // no driver chosen yet
        struct device_attach_data data = {
            .dev = dev,
            .check_async = allow_async,
            .want_async = false,
        };

        if (dev->parent)
            pm_runtime_get_sync(dev->parent);
        // walk every driver on the bus and try to attach
        ret = bus_for_each_drv(dev->bus, NULL, &data,
                    __device_attach_driver);
        if (!ret && allow_async && data.have_async) {
            /*
             * If we could not find appropriate driver
             * synchronously and we are allowed to do
             * async probes and there are drivers that
             * want to probe asynchronously, we'll
             * try them.
             */
            dev_dbg(dev, "scheduling asynchronous probe\n");
            get_device(dev);
            async_schedule(__device_attach_async_helper, dev);
        } else {
            pm_request_idle(dev);
        }

        if (dev->parent)
            pm_runtime_put(dev->parent);
    }
out_unlock:
    device_unlock(dev);
    return ret;
}

通过上面3个函数的追踪，__device_attach函数遍历bus所有的驱动，尝试执行attach，具体调用__device_attach_driver函数。

/*
 * __device_attach_driver - offer one driver to the device being attached.
 * @drv: candidate driver.
 * @_data: struct device_attach_data describing the attach attempt.
 *
 * Returns -EBUSY if the device already has a driver, 0 if the driver
 * does not match or is skipped in this pass because its async
 * preference differs from the one requested, and otherwise the result
 * of driver_probe_device().
 */
static int __device_attach_driver(struct device_driver *drv, void *_data)
{
    struct device_attach_data *attach = _data;
    struct device *dev = attach->dev;
    bool drv_async;

    /*
     * Another thread (driver load, device registration, deferred
     * probe) may have claimed the device in the meantime.
     */
    if (dev->driver)
        return -EBUSY;

    /* Skip drivers that do not match this device. */
    if (!driver_match_device(drv, dev))
        return 0;

    drv_async = driver_allows_async_probing(drv);
    if (drv_async) {
        /* Remember that at least one matching driver wants async probing. */
        attach->have_async = true;
    }

    /* Only probe drivers whose async preference matches this pass. */
    if (attach->check_async && drv_async != attach->want_async)
        return 0;

    return driver_probe_device(drv, dev);
}

/*
 * driver_probe_device - probe a device with a matched driver.
 * @drv: driver to bind.
 * @dev: device to probe.
 *
 * Returns -ENODEV if the device is not registered, otherwise the value
 * returned by really_probe().  The parent, when there is one, is kept
 * runtime-resumed for the duration of the probe.
 */
int driver_probe_device(struct device_driver *drv, struct device *dev)
{
    int status;

    if (!device_is_registered(dev))
        return -ENODEV;

    pr_debug("bus: '%s': %s: matched device %s with driver %s\n",
         drv->bus->name, __func__, dev_name(dev), drv->name);

    if (dev->parent) {
        pm_runtime_get_sync(dev->parent);
    }

    pm_runtime_barrier(dev);
    status = really_probe(dev, drv); /* the actual probing happens here */
    pm_request_idle(dev);

    if (dev->parent) {
        pm_runtime_put(dev->parent);
    }

    return status;
}

从上面两个函数来看，真正执行probe的函数是really_probe。

/*
 * really_probe - bind a driver to a device and run the probe callback.
 * @dev: device to probe.
 * @drv: driver to bind to it.
 *
 * Returns 1 when the device was bound to the driver, and 0 when any
 * intermediate step failed; in the failure case everything done so far
 * has been rolled back.  Errors from ->probe are logged but never
 * propagated.
 */
static int really_probe(struct device *dev, struct device_driver *drv)
{
    int ret = 0;
    int local_trigger_count = atomic_read(&deferred_trigger_count);

    atomic_inc(&probe_count);
    pr_debug("bus: '%s': %s: probing driver %s with device %s\n",
         drv->bus->name, __func__, drv->name, dev_name(dev));
    WARN_ON(!list_empty(&dev->devres_head));

    dev->driver = drv; // point the device at the driver being tried

    /* If using pinctrl, bind pins now before probing */
    ret = pinctrl_bind_pins(dev);
    if (ret)
        goto probe_failed;

    /*
     * NOTE: ret is still 0 on this failure path, so the switch below
     * takes the default branch and logs "failed with error 0".
     */
    if (driver_sysfs_add(dev)) {  // create the sysfs links between the driver directory and the device directory
        printk(KERN_ERR "%s: driver_sysfs_add(%s) failed\n",
            __func__, dev_name(dev));
        goto probe_failed;
    }

    if (dev->pm_domain && dev->pm_domain->activate) {
        ret = dev->pm_domain->activate(dev);
        if (ret)
            goto probe_failed;
    }

    /*
     * Ensure devices are listed in devices_kset in correct order
     * It's important to move Dev to the end of devices_kset before
     * calling .probe, because it could be recursive and parent Dev
     * should always go first
     */
    devices_kset_move_last(dev);

    if (dev->bus->probe) {
        ret = dev->bus->probe(dev); // the bus_type probe method takes precedence
        if (ret)
            goto probe_failed;
    } else if (drv->probe) {
        ret = drv->probe(dev);  // otherwise fall back to the driver's own probe method
        if (ret)
            goto probe_failed;
    }

    pinctrl_init_done(dev);

    if (dev->pm_domain && dev->pm_domain->sync)
        dev->pm_domain->sync(dev);

    driver_bound(dev); // link the device into the driver's device list
    ret = 1;
    pr_debug("bus: '%s': %s: bound device %s to driver %s\n",
         drv->bus->name, __func__, dev_name(dev), drv->name);
    goto done;

probe_failed:  // probing failed: undo everything done above
    devres_release_all(dev);
    driver_sysfs_remove(dev);
    dev->driver = NULL;
    dev_set_drvdata(dev, NULL);
    if (dev->pm_domain && dev->pm_domain->dismiss)
        dev->pm_domain->dismiss(dev);

    switch (ret) {
    case -EPROBE_DEFER:
        /* Driver requested deferred probing */
        dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name);
        driver_deferred_probe_add(dev);
        /* Did a trigger occur while probing? Need to re-trigger if yes */
        if (local_trigger_count != atomic_read(&deferred_trigger_count))
            driver_deferred_probe_trigger();
        break;
    case -ENODEV:
    case -ENXIO:
        pr_debug("%s: probe of %s rejects match %d\n",
             drv->name, dev_name(dev), ret);
        break;
    default:
        /* driver matched but the probe failed */
        printk(KERN_WARNING
               "%s: probe of %s failed with error %d\n",
               drv->name, dev_name(dev), ret);
    }
    /*
     * Ignore errors returned by ->probe so that the next driver can try
     * its luck.
     */
    ret = 0;
done:
    atomic_dec(&probe_count);
    wake_up(&probe_waitqueue);
    return ret;
}

到此，设备添加到系统的主要流程便基本清楚，不再往下跟踪。

驱动

驱动数据结构

/*
 * struct device_driver - the driver-model view of a device driver.
 *
 * Holds the driver's name, the bus type it belongs to, its match tables,
 * the callbacks invoked on the devices it handles, and a pointer to the
 * driver core's private bookkeeping (struct driver_private).
 */
struct device_driver {
    const char      *name;     // name of the driver
    struct bus_type     *bus;  // bus type this driver belongs to

    struct module       *owner;
    const char      *mod_name;  /* used for built-in modules */

    bool suppress_bind_attrs;   /* disables bind/unbind via sysfs */
    enum probe_type probe_type;

    const struct of_device_id   *of_match_table;
    const struct acpi_device_id *acpi_match_table;

    int (*probe) (struct device *dev);  // called to initialise a device before it is bound to the driver
    int (*remove) (struct device *dev); // called when a device is unbound from the driver
    void (*shutdown) (struct device *dev);
    int (*suspend) (struct device *dev, pm_message_t state);
    int (*resume) (struct device *dev);
    const struct attribute_group **groups; // attribute groups of the driver

    const struct dev_pm_ops *pm; // power-management operations

    struct driver_private *p;  // private data of the driver core
};

/*
 * struct driver_private - driver-core private part of a device_driver.
 *
 * Allocated in bus_add_driver() and linked to the public structure
 * through device_driver.p / driver_private.driver.
 */
struct driver_private {
    struct kobject kobj;
    struct klist klist_devices;   // devices bound to this driver
    struct klist_node knode_bus;  // node in the bus type's list of drivers
    struct module_kobject *mkobj;
    struct device_driver *driver;  // back pointer to the public device_driver
};

device_driver结构中，bus域指向驱动所属的总线类型；driver_private结构中的knode_bus域用于链入总线类型的驱动链表，klist_devices域用于链接所有绑定到本驱动的设备。

驱动注册与反注册

驱动在加载时，需要将其注册到总线类型，调用driver_register实现：

/*
 * driver_register - register a driver with its bus.
 * @drv: driver to register.
 *
 * Returns 0 on success, -EBUSY if a driver with the same name is already
 * registered on the bus, or the error returned by bus_add_driver() or
 * driver_add_groups().  The bus must already be registered.
 */
int driver_register(struct device_driver *drv)
{
    struct device_driver *existing;
    int err;

    /* The bus has to be part of the driver model already. */
    BUG_ON(!drv->bus->p);

    /*
     * A callback implemented by both the bus type and the driver is
     * redundant: warn so that the driver gets converted.
     */
    if ((drv->bus->probe && drv->probe) ||
        (drv->bus->remove && drv->remove) ||
        (drv->bus->shutdown && drv->shutdown)) {
        printk(KERN_WARNING "Driver '%s' needs updating - please use "
            "bus_type methods\n", drv->name);
    }

    /* Driver names are unique per bus. */
    existing = driver_find(drv->name, drv->bus);
    if (existing) {
        printk(KERN_ERR "Error: Driver '%s' is already registered, "
            "aborting...\n", drv->name);
        return -EBUSY;
    }

    err = bus_add_driver(drv);
    if (err)
        return err;

    /* Create the driver's own attribute files; undo the bus add on failure. */
    err = driver_add_groups(drv, drv->groups);
    if (err) {
        bus_remove_driver(drv);
        return err;
    }

    /* Announce the new driver to user space. */
    kobject_uevent(&drv->p->kobj, KOBJ_ADD);
    return 0;
}
EXPORT_SYMBOL_GPL(driver_register);

添加driver到bus_type，由bus_add_driver实现：

int bus_add_driver(struct device_driver *drv)
{
    struct bus_type *bus;
    struct driver_private *priv;
    int error = 0;

    bus = bus_get(drv->bus);
    if (!bus)
        return -EINVAL;

    pr_debug("bus: '%s': add driver %s\n", bus->name, drv->name);

    priv = kzalloc(sizeof(*priv), GFP_KERNEL);  //分配driver_private结构空间
    if (!priv) {
        error = -ENOMEM;
        goto out_put_bus;
    }
    klist_init(&priv->klist_devices, NULL, NULL); //初始化driver设备链表
    priv->driver = drv; //关联device_driver和driver_private
    drv->p = priv;
    priv->kobj.kset = bus->p->drivers_kset; //driver_private中的kobj的kset域指向subsys中的drivers_kset
    error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL,  //添加driver到sysfs
                     "%s", drv->name);
    if (error)
        goto out_unregister;

    klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers); //添加driver到bus的驱动链表中
    if (drv->bus->p->drivers_autoprobe) {  //自动探测
        if (driver_allows_async_probing(drv)) {  //允许异步执行probe
            pr_debug("bus: '%s': probing driver %s asynchronously\n",
                drv->bus->name, drv->name);
            async_schedule(driver_attach_async, drv); //异步probe
        } else {
            error = driver_attach(drv);  //同步probe
            if (error)
                goto out_unregister;
        }
    }
    module_add_driver(drv->owner, drv);  //驱动实现的模块

    error = driver_create_file(drv, &driver_attr_uevent);  //在driver中添加uevent属性文件
    if (error) {
        printk(KERN_ERR "%s: uevent attr (%s) failed\n",
            __func__, drv->name);
    }
    error = driver_add_groups(drv, bus->drv_groups);  //添加driver的属性文件
    if (error) {
        /* How the hell do we get out of this pickle? Give up */
        printk(KERN_ERR "%s: driver_create_groups(%s) failed\n",
            __func__, drv->name);
    }

    if (!drv->suppress_bind_attrs) {
        error = add_bind_files(drv);  //在driver目录添加的bind和unbind两个属性文件
        if (error) {
            /* Ditto */
            printk(KERN_ERR "%s: add_bind_files(%s) failed\n",
                __func__, drv->name);
        }
    }

    return 0;

out_unregister:
    kobject_put(&priv->kobj);
    kfree(drv->p);
    drv->p = NULL;
out_put_bus:
    bus_put(bus);
    return error;
}

bus_add_driver函数完成驱动添加到总线类型，当驱动添加完成后，如果总线类型设置了允许自动探测标志drivers_autoprobe，便可以根据是否允许异步探测调用driver_attach_async或driver_attach，driver_attach_async也是调用driver_attach:

/*
 * driver_attach - try to bind a driver to the devices on its bus.
 * @drv: driver to attach.
 *
 * Runs __driver_attach() for each device on the driver's bus and returns
 * the result of bus_for_each_dev().
 */
int driver_attach(struct device_driver *drv)
{
    int result;

    result = bus_for_each_dev(drv->bus, NULL, drv, __driver_attach);
    return result;
}
EXPORT_SYMBOL_GPL(driver_attach);

static int __driver_attach(struct device *dev, void *data)
{
    struct device_driver *drv = data;

    /*
     * Lock device and try to bind to it. We drop the error
     * here and always return 0, because we need to keep trying
     * to bind to devices and some drivers will return an error
     * simply if it didn't support the device.
     *
     * driver_probe_device() will spit a warning if there
     * is an error.
     */

    if (!driver_match_device(drv, dev)) //调用bus_type.match
        return 0;

    if (dev->parent)    /* Needed for USB */
        device_lock(dev->parent);
    device_lock(dev);
    if (!dev->driver)
        driver_probe_device(drv, dev); //完成probe的主要函数
    device_unlock(dev);
    if (dev->parent)
        device_unlock(dev->parent);

    return 0;
}

/*
 * driver_probe_device - probe a device with a matched driver.
 * @drv: driver to bind.
 * @dev: device to probe.
 *
 * Returns -ENODEV if the device is not registered, otherwise the value
 * returned by really_probe().  The parent, when there is one, is kept
 * runtime-resumed for the duration of the probe.
 */
int driver_probe_device(struct device_driver *drv, struct device *dev)
{
    int status;

    if (!device_is_registered(dev))
        return -ENODEV;

    pr_debug("bus: '%s': %s: matched device %s with driver %s\n",
         drv->bus->name, __func__, dev_name(dev), drv->name);

    if (dev->parent) {
        pm_runtime_get_sync(dev->parent);
    }

    pm_runtime_barrier(dev);
    status = really_probe(dev, drv); /* the actual probing happens here */
    pm_request_idle(dev);

    if (dev->parent) {
        pm_runtime_put(dev->parent);
    }

    return status;
}

根据上面3个函数，最终仍然是调用前面描述过的really_probe函数完成最后的探测。

到这里驱动注册完成，结合之前的设备注册流程，无论是驱动注册还是设备注册，只要总线类型设置了自动探测标志位，这两个流程都会执行探测。所以设备发现与驱动加载的先后顺序已经不再重要，也正是通过这种双向探测方式，Linux内核支持了设备的热插拔机制。

驱动卸载时，需要调用driver_unregister函数，使driver脱离总线类型：

/*
 * driver_unregister - remove a driver from the system.
 * @drv: driver to unregister.
 *
 * Removes the driver's own attribute groups and then detaches it from
 * its bus.  Warns and does nothing if @drv is NULL or has no private
 * data attached.
 */
void driver_unregister(struct device_driver *drv)
{
    if (drv && drv->p) {
        driver_remove_groups(drv, drv->groups); /* the driver's attribute files */
        bus_remove_driver(drv);                 /* take the driver off its bus */
        return;
    }

    WARN(1, "Unexpected driver unregister!\n");
}
EXPORT_SYMBOL_GPL(driver_unregister);

/*
 * bus_remove_driver - take a driver off its bus.
 * @drv: driver to remove.
 *
 * Undoes bus_add_driver(): removes the sysfs files, unlinks the driver
 * from the bus's driver list, detaches it from its devices and drops
 * the references on the driver kobject and on the bus.  Does nothing
 * for a driver without a bus.
 */
void bus_remove_driver(struct device_driver *drv)
{
    if (!drv->bus)
        return;

    /* bind/unbind files only exist when they were not suppressed */
    if (!drv->suppress_bind_attrs) {
        remove_bind_files(drv);
    }
    /* driver attribute groups defined by the bus */
    driver_remove_groups(drv, drv->bus->drv_groups);
    /* the uevent file in the driver directory */
    driver_remove_file(drv, &driver_attr_uevent);
    /* unlink from the bus's list of drivers */
    klist_remove(&drv->p->knode_bus);
    pr_debug("bus: '%s': remove driver %s\n", drv->bus->name, drv->name);
    /* unbind the driver from its devices */
    driver_detach(drv);
    /* undo the association with the owning module */
    module_remove_driver(drv);
    /* drop the kobject reference, then the bus reference */
    kobject_put(&drv->p->kobj);
    bus_put(drv->bus);
}

类

类数据结构

/*
 * struct class - a device class in the driver model.
 *
 * Groups devices by function.  Holds the class name, its attributes,
 * the callbacks applied to devices of the class, and a pointer to the
 * subsystem-private data shared with bus types (struct subsys_private).
 */
struct class {
    const char      *name;       // name of the class
    struct module       *owner;  // module that implements this class

    struct class_attribute      *class_attrs;     // attributes of the class itself
    const struct attribute_group    **dev_groups; // default attributes of devices belonging to the class
    struct kobject          *dev_kobj;            // kobject under which devices of this class appear in /sys/dev

    int (*dev_uevent)(struct device *dev, struct kobj_uevent_env *env); // adds class-specific environment variables before an event is sent
    char *(*devnode)(struct device *dev, umode_t *mode); // returns the device node name when a device is created

    void (*class_release)(struct class *class); // called when the class is released
    void (*dev_release)(struct device *dev);    // called when a device is released

    int (*suspend)(struct device *dev, pm_message_t state); // called when a device goes to sleep
    int (*resume)(struct device *dev);                      // called when a device is woken up

    const struct kobj_ns_type_operations *ns_type;  // namespace support in sysfs
    const void *(*namespace)(struct device *dev);   // returns the namespace the device is in

    const struct dev_pm_ops *pm;  // power-management operations

    struct subsys_private *p;     // subsystem-private data of the class
};

类的私有数据类型与总线类型的私有数据类型都是subsys_private，这里将不再重复描述。

类注册与反注册

子系统需要使用类时，需要调用class_register函数向系统注册类（注册后在/sys/class/目录下可见）：

/*
 * class_register - register a class.
 *
 * Gives every call site its own static lock_class_key and passes it to
 * __class_register().  The statement expression evaluates to the value
 * returned by __class_register().
 */
#define class_register(class)           \
({                                      \
    static struct lock_class_key __key; \
    __class_register(class, &__key);    \
})

/*
 * __class_register - register a class with the driver core.
 * @cls: class to register.
 * @key: lock class key used to initialise the subsystem mutex.
 *
 * Allocates and initialises the class's subsys_private, registers its
 * kset and then creates the class attribute files.
 *
 * Returns 0 on success, -ENOMEM if the private data cannot be allocated,
 * or the error from kobject_set_name(), kset_register() or
 * add_class_attrs().
 */
int __class_register(struct class *cls, struct lock_class_key *key)
{
    struct subsys_private *priv;
    int err;

    pr_debug("device class '%s': registering\n", cls->name);

    priv = kzalloc(sizeof(*priv), GFP_KERNEL);
    if (!priv)
        return -ENOMEM;

    /* device list, interface list, glue directories and mutex of the class */
    klist_init(&priv->klist_devices, klist_class_dev_get, klist_class_dev_put);
    INIT_LIST_HEAD(&priv->interfaces);
    kset_init(&priv->glue_dirs);
    __mutex_init(&priv->mutex, "subsys mutex", key);

    /* the kobject is named after the class */
    err = kobject_set_name(&priv->subsys.kobj, "%s", cls->name);
    if (err)
        goto free_priv;

    /* set the default /sys/dev directory for devices of this class */
    if (!cls->dev_kobj)
        cls->dev_kobj = sysfs_dev_char_kobj;

#ifdef CONFIG_BLOCK
    /* let the block class directory show up in the root of sysfs */
    if (!(sysfs_deprecated && cls == &block_class))
        priv->subsys.kobj.kset = class_kset;
#else
    priv->subsys.kobj.kset = class_kset;
#endif
    priv->subsys.kobj.ktype = &class_ktype;

    /* link the class and its private data to each other */
    priv->class = cls;
    cls->p = priv;

    err = kset_register(&priv->subsys);
    if (err)
        goto free_priv;

    /* create the class attribute files, holding a class reference meanwhile */
    err = add_class_attrs(class_get(cls));
    class_put(cls);
    return err;

free_priv:
    kfree(priv);
    return err;
}
EXPORT_SYMBOL_GPL(__class_register);

类的注册比较简单，注释已经比较详细。当子系统需要卸载类时，需要调用class_unregister函数：

/*
 * class_unregister - unregister a class.
 * @cls: class to unregister.
 *
 * Removes the class attribute files and then unregisters the kset that
 * represents the class.
 */
void class_unregister(struct class *cls)
{
    pr_debug("device class '%s': unregistering\n", cls->name);

    /* attribute files first ... */
    remove_class_attrs(cls);
    /* ... then the kset of the class itself */
    kset_unregister(&cls->p->subsys);
}

接口

接口数据结构

/*
 * struct class_interface - an interface attached to a class.
 *
 * add_dev()/remove_dev() are called for the devices of the class when the
 * interface is registered or unregistered, and add_dev() is also called
 * when a device is added to the class.
 */
struct class_interface {
    struct list_head    node;    // node in the class's list of interfaces
    struct class        *class;  // class this interface belongs to
    // callbacks that add a device to, or remove it from, the interface
    int (*add_dev)      (struct device *, struct class_interface *);
    void (*remove_dev)  (struct device *, struct class_interface *);
};

接口注册与反注册

向类中注册接口，需要调用class_interface_register函数完成：

/*
 * class_interface_register - attach an interface to its class.
 * @class_intf: interface to register; class_intf->class selects the class.
 *
 * Links the interface into the class's interface list and, if the
 * interface provides add_dev(), calls it for every device currently in
 * the class.  Both steps happen under the class mutex.
 *
 * Returns 0 on success, -ENODEV if the interface or its class pointer is
 * NULL, -EINVAL if no reference on the class could be obtained.
 */
int class_interface_register(struct class_interface *class_intf)
{
    struct class_dev_iter iter;
    struct device *dev;
    struct class *cls;

    if (class_intf == NULL || class_intf->class == NULL)
        return -ENODEV;

    /* Hold a reference on the class while the interface is attached. */
    cls = class_get(class_intf->class);
    if (cls == NULL)
        return -EINVAL;

    mutex_lock(&cls->p->mutex);
    list_add_tail(&class_intf->node, &cls->p->interfaces);
    if (class_intf->add_dev) {
        /* Introduce every existing device of the class to the interface. */
        class_dev_iter_init(&iter, cls, NULL, NULL);
        for (dev = class_dev_iter_next(&iter); dev;
             dev = class_dev_iter_next(&iter))
            class_intf->add_dev(dev, class_intf);
        class_dev_iter_exit(&iter);
    }
    mutex_unlock(&cls->p->mutex);

    return 0;
}

从类中删除接口，需要调用class_interface_unregister函数完成：

/*
 * class_interface_unregister - detach an interface from its class.
 * @class_intf: interface to unregister.
 *
 * Unlinks the interface from the class's interface list and, if the
 * interface provides remove_dev(), calls it for every device in the
 * class, all under the class mutex.  Finally drops a reference on the
 * class.  Does nothing if the interface has no class.
 */
void class_interface_unregister(struct class_interface *class_intf)
{
    struct class *cls = class_intf->class;
    struct class_dev_iter iter;
    struct device *dev;

    if (cls == NULL)
        return;

    mutex_lock(&cls->p->mutex);
    list_del_init(&class_intf->node);
    if (class_intf->remove_dev) {
        /* Let the interface say goodbye to every device of the class. */
        class_dev_iter_init(&iter, cls, NULL, NULL);
        for (dev = class_dev_iter_next(&iter); dev;
             dev = class_dev_iter_next(&iter))
            class_intf->remove_dev(dev, class_intf);
        class_dev_iter_exit(&iter);
    }
    mutex_unlock(&cls->p->mutex);

    class_put(cls);
}

基于设备驱动模型实现子系统

Linux设备驱动模型已经实现了各种对象之间的关系以及它们在sysfs中的呈现方式。实现子系统只需要定义业务自身的总线类型、设备、驱动、类、接口，分别“继承”bus_type、device、device_driver、class、class_interface，并根据具体业务实现各个结构规定的回调函数，最后调用上述的注册函数添加到系统，便完成子系统的开发。

SCSI子系统之概述

Linux SCSI子系统的分层架构：

这里写图片描述

低层：代表与SCSI的物理接口的实际驱动器，例如各个厂商为其特定的主机适配器(Host Bus Adapter, HBA)开发的驱动，低层驱动主要作用是发现连接到主机适配器的scsi设备，在内存中构建scsi子系统所需的数据结构，并提供消息传递接口，将scsi命令的接受与发送解释为主机适配器的操作。
高层: 代表各种scsi设备类型的驱动，如scsi磁盘驱动，scsi磁带驱动，高层驱动认领低层驱动发现的scsi设备，为这些设备分配名称，将对设备的IO转换为scsi命令，交由低层驱动处理。
中层：包含scsi栈的公共服务函数。高层和低层通过调用中层的函数完成其功能，而中层在执行过程中，也需要调用高层和低层注册的回调函数做一些个性化处理。

Linux SCSI模型

这里写图片描述

Linux SCSI模型是内核的抽象，主机适配器连接主机IO总线(如PCI总线)和存储IO总线(如SCSI总线)。一台计算机可以有多个主机适配器，而主机适配器可以控制一条或多条SCSI总线，一条总线可以有多个目标节点与之相连，并且一个目标节点可以有多个逻辑单元。

在Linux SCSI子系统中，内核中的目标节点(target)对应SCSI磁盘，SCSI磁盘中可以有多个逻辑单元，统一由磁盘控制器控制，这些逻辑单元才是真正作为IO终点的存储设备，内核用设备(device)对逻辑单元进行抽象；内核中的Host对应主机适配器(物理的HBA/RAID卡，虚拟的iscsi target)

内核使用四元组来唯一标识一个scsi的逻辑单元，在sysfs中查看sda磁盘<2:0:0:0>显示如下：

root@ubuntu16:/home/comet/Costor/bin# ls /sys/bus/scsi/devices/2\:0\:0\:0/block/sda/
alignment_offset  device             events_poll_msecs  integrity  removable  sda5    subsystem
bdi               discard_alignment  ext_range          power      ro         size    trace
capability        events             holders            queue      sda1       slaves  uevent
dev               events_async       inflight           range      sda2       stat
root@ubuntu16:/home/comet/Costor/bin# cat /sys/bus/scsi/devices/2\:0\:0\:0/block/sda/dev
8:0
root@ubuntu16:/home/comet/Costor/bin# ll /dev/sda
brw-rw---- 1 root disk 8, 0 Sep 19 11:36 /dev/sda

host: 主机适配器的唯一编号。
channel: 主机适配器中scsi通道编号，由主机适配器固件维护。
id: 目标节点唯一标识符。
lun: 目标节点内逻辑单元编号。

SCSI命令

SCSI 命令是在 Command Descriptor Block (CDB) 中定义的。CDB 包含了用来定义要执行的特定操作的操作代码，以及大量特定于操作的参数。

命令	用途
Test unit ready	查询设备是否已经准备好进行传输
Inquiry	请求设备基本信息
Request sense	请求之前命令的错误信息
Read capacity	请求存储容量信息
Read	从设备读取数据
Write	向设备写入数据
Mode sense	请求模式页面（设备参数）
Mode select	在模式页面配置设备参数

借助大约 60 种可用命令，SCSI 可适用于许多设备（包括随机存取设备，比如磁盘和像磁带这样的顺序存储设备）。SCSI 也提供了专门的命令以访问箱体服务（比如存储箱体内部当前的传感和温度）。

核心数据结构

主机适配器模板scsi_host_template

主机适配器模板是相同型号主机适配器的公共内容，包括请求队列深度、SCSI命令处理回调函数、错误处理恢复函数。分配主机适配器结构时，需要使用主机适配器模板来赋值。在编写SCSI低层驱动时，第一步便是定义模板scsi_host_template，之后才能由模板生成主机适配器。

struct scsi_host_template {
    struct module *module;  //指向使用该模板实现的scsi_host，低层驱动模块。
    const char *name;       //主机适配器名称

    int (* detect)(struct scsi_host_template *);
    int (* release)(struct Scsi_Host *);

    const char *(* info)(struct Scsi_Host *); //返回HBA相关信息，可选实现

    int (* ioctl)(struct scsi_device *dev, int cmd, void __user *arg); //用户空间ioctl函数的实现，可选实现


#ifdef CONFIG_COMPAT
    //通过该函数，支持32位系统的用户态ioctl函数
    int (* compat_ioctl)(struct scsi_device *dev, int cmd, void __user *arg);
#endif

    //将scsi命令放进低层驱动的队列，由中间层调用，必须实现
    int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *);

    //以下5个函数是错误处理回调函数,由中间层按照严重程度调用
    int (* eh_abort_handler)(struct scsi_cmnd *);        //Abort
    int (* eh_device_reset_handler)(struct scsi_cmnd *); //Device Reset
    int (* eh_target_reset_handler)(struct scsi_cmnd *); //Target Reset
    int (* eh_bus_reset_handler)(struct scsi_cmnd *);    //Bus Reset
    int (* eh_host_reset_handler)(struct scsi_cmnd *);   //Host Reset

    //当扫描到新磁盘时调用，中间层回调这个函数中可以分配和初始化低层驱动所需要的结构
    int (* slave_alloc)(struct scsi_device *)

//在设备受到INQUIRY命令后，执行相关的配置操作
    int (* slave_configure)(struct scsi_device *);

    //在scsi设备销毁之前调用，中间层回调用于释放slave_alloc分配的私有数据
    void (* slave_destroy)(struct scsi_device *);

    //当发现新的target，中间层调用，用户分配target私有数据
    int (* target_alloc)(struct scsi_target *);

    //在target被销毁之前，中间层调用，低层驱动实现，用于释放target_alloc分配的数据
    void (* target_destroy)(struct scsi_target *);

    //需要自定义扫描target逻辑时，中间层循环检查返回值，直到该函数返回1，表示扫描完成
    int (* scan_finished)(struct Scsi_Host *, unsigned long);

    //需要自定义扫描target逻辑时，扫描开始前回调
    void (* scan_start)(struct Scsi_Host *);

    //改变主机适配器的队列深度，返回设置的队列深度
    int (* change_queue_depth)(struct scsi_device *, int);

    //返回磁盘的BIOS参数，如size, device, list (heads, sectors, cylinders)
    int (* bios_param)(struct scsi_device *, struct block_device *,
            sector_t, int []);

    void (*unlock_native_capacity)(struct scsi_device *);

    //在procfs中的读写操作回调
    int (*show_info)(struct seq_file *, struct Scsi_Host *);
    int (*write_info)(struct Scsi_Host *, char *, int);

    //中间层发现scsi命令超时回调
    enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *);

    //通过sysfs属性reset主机适配器时，回调
    int (*host_reset)(struct Scsi_Host *shost, int reset_type);
#define SCSI_ADAPTER_RESET  1
#define SCSI_FIRMWARE_RESET 2

    const char *proc_name; //在proc文件系统的名称

    struct proc_dir_entry *proc_dir;

    int can_queue; //主机适配器能同时接受的命令数

    int this_id;

    /*
     * This determines the degree to which the host adapter is capable
     * of scatter-gather.
     */  //聚散列表的参数
    unsigned short sg_tablesize;
    unsigned short sg_prot_tablesize;

    /*
     * Set this if the host adapter has limitations beside segment count.
     */ //单个scsi命令能够访问的扇区最大数量
    unsigned int max_sectors;

    /*
     * DMA scatter gather segment boundary limit. A segment crossing this
     * boundary will be split in two.
     */
    unsigned long dma_boundary; //DMA聚散段边界值，超过该值将被切割成两个

#define SCSI_DEFAULT_MAX_SECTORS    1024

    short cmd_per_lun;

    /*
     * present contains counter indicating how many boards of this
     * type were found when we did the scan.
     */
    unsigned char present;

    /* If use block layer to manage tags, this is tag allocation policy */
    int tag_alloc_policy;

    /*
     * Track QUEUE_FULL events and reduce queue depth on demand.
     */
    unsigned track_queue_depth:1;

    /*
     * This specifies the mode that a LLD supports.
     */
    unsigned supported_mode:2; //低层驱动支持的模式(initiator或target)

    /*
     * True if this host adapter uses unchecked DMA onto an ISA bus.
     */
    unsigned unchecked_isa_dma:1;

    unsigned use_clustering:1;

    /*
     * True for emulated SCSI host adapters (e.g. ATAPI).
     */
    unsigned emulated:1;

    /*
     * True if the low-level driver performs its own reset-settle delays.
     */
    unsigned skip_settle_delay:1;

    /* True if the controller does not support WRITE SAME */
    unsigned no_write_same:1;

    /*
     * True if asynchronous aborts are not supported
     */
    unsigned no_async_abort:1;

    /*
     * Countdown for host blocking with no commands outstanding.
     */
    unsigned int max_host_blocked; //主机适配器发送队列的低阀值，允许累计多个命令同时派发

#define SCSI_DEFAULT_HOST_BLOCKED   7

    /*
     * Pointer to the sysfs class properties for this host, NULL terminated.
     */
    struct device_attribute **shost_attrs; //主机适配器类属性

    /*
     * Pointer to the SCSI device properties for this host, NULL terminated.
     */
    struct device_attribute **sdev_attrs;  //主机适配器设备属性

    struct list_head legacy_hosts;

    u64 vendor_id;

    /*
     * Additional per-command data allocated for the driver.
     */  //scsi 命令缓冲池，scsi命令都是预先分配好的，保存在cmd_pool中
    unsigned int cmd_size;
    struct scsi_host_cmd_pool *cmd_pool;

    /* temporary flag to disable blk-mq I/O path */
    bool disable_blk_mq;  //禁用通用块层多队列模式标志
};

主机适配器Scsi_Host

Scsi_Host描述一个SCSI主机适配器，SCSI主机适配器通常是一块基于PCI总线的扩展卡或是一个SCSI控制器芯片。每个SCSI主机适配器可以存在多个通道，一个通道实际扩展了一条SCSI总线。每个通道可以连接多个SCSI目标节点，具体连接数量与SCSI总线带载能力有关，或者受具体SCSI协议的限制。真实的主机总线适配器是接入主机IO总线上(通常是PCI总线)，在系统启动时，会扫描挂载在PCI总线上的设备，此时会分配主机总线适配器。
Scsi_Host结构包含内嵌通用设备，将被链入SCSI总线类型(scsi_bus_type)的设备链表。

struct Scsi_Host {
    struct list_head    __devices; //设备链表
    struct list_head    __targets; //目标节点链表

    struct scsi_host_cmd_pool *cmd_pool; //scsi命令缓冲池
    spinlock_t      free_list_lock;   //保护free_list
    struct list_head    free_list; /* backup store of cmd structs, scsi命令预先分配的备用命令链表 */
    struct list_head    starved_list; //scsi命令的饥饿链表

    spinlock_t      default_lock;
    spinlock_t      *host_lock;

    struct mutex        scan_mutex;/* serialize scanning activity */

    struct list_head    eh_cmd_q; //执行错误的scsi命令的链表
    struct task_struct    * ehandler;  /* Error recovery thread. 错误恢复线程 */
    struct completion     * eh_action; /* Wait for specific actions on the
                          host. */
    wait_queue_head_t       host_wait; //scsi设备恢复等待队列
    struct scsi_host_template *hostt;  //主机适配器模板
    struct scsi_transport_template *transportt; //指向SCSI传输层模板

    /*
     * Area to keep a shared tag map (if needed, will be
     * NULL if not).
     */
    union {
        struct blk_queue_tag    *bqt;
        struct blk_mq_tag_set   tag_set; //SCSI支持多队列时使用
    };
    //已经派发给主机适配器(低层驱动)的scsi命令数
    atomic_t host_busy;        /* commands actually active on low-level */
    atomic_t host_blocked;  //阻塞的scsi命令数

    unsigned int host_failed;      /* commands that failed.
                          protected by host_lock */
    unsigned int host_eh_scheduled;    /* EH scheduled without command */

    unsigned int host_no;  /* Used for IOCTL_GET_IDLUN, /proc/scsi et al. 系统内唯一标识 */

    /* next two fields are used to bound the time spent in error handling */
    int eh_deadline;
    unsigned long last_reset; //记录上次reset时间


    /*
     * These three parameters can be used to allow for wide scsi,
     * and for host adapters that support multiple busses
     * The last two should be set to 1 more than the actual max id
     * or lun (e.g. 8 for SCSI parallel systems).
     */
    unsigned int max_channel; //主机适配器的最大通道编号
    unsigned int max_id;      //主机适配器目标节点最大编号
    u64 max_lun;              //主机适配器lun最大编号

    unsigned int unique_id;

    /*
     * The maximum length of SCSI commands that this host can accept.
     * Probably 12 for most host adapters, but could be 16 for others.
     * or 260 if the driver supports variable length cdbs.
     * For drivers that don't set this field, a value of 12 is
     * assumed.
     */
    unsigned short max_cmd_len;  //主机适配器可以接受的最长的SCSI命令
    //下面这段在scsi_host_template中也有，由template中的字段赋值
    int this_id;
    int can_queue;
    short cmd_per_lun;
    short unsigned int sg_tablesize;
    short unsigned int sg_prot_tablesize;
    unsigned int max_sectors;
    unsigned long dma_boundary;
    /*
     * In scsi-mq mode, the number of hardware queues supported by the LLD.
     *
     * Note: it is assumed that each hardware queue has a queue depth of
     * can_queue. In other words, the total queue depth per host
     * is nr_hw_queues * can_queue.
     */
    unsigned nr_hw_queues; //在scsi-mq模式中，低层驱动所支持的硬件队列的数量
    /*
     * Used to assign serial numbers to the cmds.
     * Protected by the host lock.
     */
    unsigned long cmd_serial_number;  //指向命令序列号unsigned active_mode:2;           //标识是initiator或target
    unsigned unchecked_isa_dma:1;
    unsigned use_clustering:1;

    /*
     * Host has requested that no further requests come through for the
     * time being.
     */
    unsigned host_self_blocked:1; //表示低层驱动要求阻塞该主机适配器，此时中间层不会继续派发命令到主机适配器队列中

    /*
     * Host uses correct SCSI ordering not PC ordering. The bit is
     * set for the minority of drivers whose authors actually read
     * the spec ;).
     */
    unsigned reverse_ordering:1;

    /* Task mgmt function in progress */
    unsigned tmf_in_progress:1;  //任务管理函数正在执行

    /* Asynchronous scan in progress */
    unsigned async_scan:1;       //异步扫描正在执行

    /* Don't resume host in EH */
    unsigned eh_noresume:1;      //在错误处理过程不恢复主机适配器

    /* The controller does not support WRITE SAME */
    unsigned no_write_same:1;

    unsigned use_blk_mq:1;       //是否使用SCSI多队列模式
    unsigned use_cmd_list:1;

    /* Host responded with short (<36 bytes) INQUIRY result */
    unsigned short_inquiry:1;

    /*
     * Optional work queue to be utilized by the transport
     */
    char work_q_name[20];  //被scsi传输层使用的工作队列
    struct workqueue_struct *work_q;

    /*
     * Task management function work queue
     */
    struct workqueue_struct *tmf_work_q; //任务管理函数工作队列

    /* The transport requires the LUN bits NOT to be stored in CDB[1] */
    unsigned no_scsi2_lun_in_cdb:1;

    /*
     * Value host_blocked counts down from
     */
    unsigned int max_host_blocked; //在派发队列中累计命令达到这个数值，才开始唤醒主机适配器

    /* Protection Information */
    unsigned int prot_capabilities;
    unsigned char prot_guard_type;

    /*
     * q used for scsi_tgt msgs, async events or any other requests that
     * need to be processed in userspace
     */
    struct request_queue *uspace_req_q; //需要在用户空间处理的scsi_tgt消息、异步事件或其他请求的请求队列

    /* legacy crap */
    unsigned long base;
    unsigned long io_port;   //I/O端口编号
    unsigned char n_io_port;
    unsigned char dma_channel;
    unsigned int  irq;


    enum scsi_host_state shost_state; //状态

    /* ldm bits */ //shost_gendev: 内嵌通用设备，SCSI设备通过这个域链入SCSI总线类型(scsi_bus_type)的设备链表
    struct device       shost_gendev, shost_dev;
    //shost_dev: 内嵌类设备, SCSI设备通过这个域链入SCSI主机适配器类型(shost_class)的设备链表
    /*
     * List of hosts per template.
     *
     * This is only for use by scsi_module.c for legacy templates.
     * For these access to it is synchronized implicitly by
     * module_init/module_exit.
     */
    struct list_head sht_legacy_list;

    /*
     * Points to the transport data (if any) which is allocated
     * separately
     */
    void *shost_data; //指向独立分配的传输层数据，由SCSI传输层使用

    /*
     * Points to the physical bus device we'd use to do DMA
     * Needed just in case we have virtual hosts.
     */
    struct device *dma_dev;

    /*
     * We should ensure that this is aligned, both for better performance
     * and also because some compilers (m68k) don't automatically force
     * alignment to a long boundary.
     */ //主机适配器专有数据
    unsigned long hostdata[0]  /* Used for storage of host specific stuff */
        __attribute__ ((aligned (sizeof(unsigned long))));
};

目标节点scsi_target

scsi_target结构中有一个内嵌驱动模型设备,被链入SCSI总线类型scsi_bus_type的设备链表。

/*
 * struct scsi_target - one SCSI target node, identified by channel and id.
 *
 * Embeds a driver-model device (dev), is linked into its host adapter's
 * target list (siblings) and heads the list of devices that belong to it
 * (devices).  starget_data[] is a trailing area for the SCSI transport
 * layer and must stay the last member.
 */
struct scsi_target {
    struct scsi_device  *starget_sdev_user; /* device currently doing I/O; NULL when there is none */
    struct list_head    siblings;  /* links into the host adapter's target list */
    struct list_head    devices;   /* list of the devices belonging to this target */
    struct device       dev;       /* generic device, used to join the driver model */
    struct kref     reap_ref; /* last put renders target invisible; reference count of this structure */
    unsigned int        channel;   /* channel number this target is on */
    unsigned int        id; /* target id ... replace
                     * scsi_device.id eventually */
    unsigned int        create:1; /* signal that it needs to be added */
    unsigned int        single_lun:1;   /* Indicates we should only
                         * allow I/O to one of the luns
                         * for the device at a time. */
    unsigned int        pdt_1f_for_no_lun:1;    /* PDT = 0x1f
                         * means no lun present. */
    unsigned int        no_report_luns:1;   /* Don't use
                         * REPORT LUNS for scanning. */
    unsigned int        expecting_lun_change:1; /* A device has reported
                         * a 3F/0E UA, other devices on
                         * the same target will also. */
    /* commands actually active on LLD. */
    atomic_t        target_busy;
    atomic_t        target_blocked;           /* annotated in the original as "number of currently blocked commands";
                                               * NOTE(review): presumably counts down from max_target_blocked - confirm */

    /*
     * LLDs should set this in the slave_alloc host template callout.
     * If set to zero then there is not limit.
     */
    unsigned int        can_queue;             /* number of commands handled at the same time */
    unsigned int        max_target_blocked;    /* threshold for blocked commands */
#define SCSI_DEFAULT_TARGET_BLOCKED 3

    char            scsi_level;                /* supported SCSI specification level */
    enum scsi_target_state  state;             /* target state */
    void            *hostdata; /* available to low-level driver */
    unsigned long       starget_data[0]; /* for the transport; used by the SCSI transport layer (mid-layer) */
    /* starget_data must be the last element!!!! */
} __attribute__((aligned(sizeof(unsigned long))));

逻辑设备scsi_device

scsi_device描述scsi逻辑设备，代表scsi磁盘的逻辑单元lun。scsi_device描述符所代表的设备可能是另一台存储设备上的SATA/SAS/SCSI磁盘或SSD。操作系统在扫描到连接在主机适配器上的逻辑设备时，创建scsi_device结构，用于scsi高层驱动和该设备通信。

/*
 * struct scsi_device - one SCSI logical device (a LUN on a target).
 *
 * Carries the addressing (host, channel, id, lun), the request queue, the
 * queue-depth bookkeeping, cached INQUIRY/VPD data, a large set of
 * capability/quirk bit flags and the two embedded driver-model devices.
 * sdev_data[] is a trailing area for the SCSI transport layer.
 */
struct scsi_device {
    struct Scsi_Host *host;  /* host bus adapter this device belongs to */
    struct request_queue *request_queue; /* request queue */

    /* the next two are protected by the host->host_lock */
    struct list_head    siblings;   /* list of all devices on this host; links into the host adapter's device list */
    struct list_head    same_target_siblings; /* just the devices sharing same target id; links into the target's device list */

    atomic_t device_busy;       /* commands actually active on LLDD */
    atomic_t device_blocked;    /* Device returned QUEUE_FULL. */

    spinlock_t list_lock;
    struct list_head cmd_list;  /* queue of in use SCSI Command structures */
    struct list_head starved_entry; /* links into the host adapter's "starved" list */
    struct scsi_cmnd *current_cmnd; /* currently active command */
    unsigned short queue_depth; /* How deep of a queue we want */
    unsigned short max_queue_depth; /* max queue depth */
    unsigned short last_queue_full_depth; /* These two are used by */
    unsigned short last_queue_full_count; /* scsi_track_queue_full() */
    unsigned long last_queue_full_time; /* last queue full time */
    unsigned long queue_ramp_up_period; /* ramp up period in jiffies */
#define SCSI_DEFAULT_RAMP_UP_PERIOD (120 * HZ)

    unsigned long last_queue_ramp_up;   /* last queue ramp up time */

    unsigned int id, channel; /* target id this device belongs to and the channel number it is on */
    u64 lun;  /* LUN number of this device */
    unsigned int manufacturer;  /* Manufacturer of device, for using
                     * vendor-specific cmd's */
    unsigned sector_size;   /* size in bytes; hardware sector size */

    void *hostdata;     /* available to low-level driver; driver-private data */
    char type;          /* SCSI device type */
    char scsi_level;    /* supported SCSI specification version, obtained with the INQUIRY command */
    char inq_periph_qual;   /* PQ from INQUIRY data */
    unsigned char inquiry_len;  /* valid bytes in 'inquiry' */
    unsigned char * inquiry;    /* INQUIRY response data */
    const char * vendor;        /* [back_compat] point into 'inquiry' ... */
    const char * model;     /* ... after scan; point to static string */
    const char * rev;       /* ... "nullnullnullnull" before scan */

#define SCSI_VPD_PG_LEN                255
    int vpd_pg83_len;          /* presumably the length of vpd_pg83. NOTE(review): the original annotation
                                * said "sense command 0x83", but the name points at VPD page 0x83 - confirm */
    unsigned char *vpd_pg83;
    int vpd_pg80_len;          /* presumably the length of vpd_pg80 (VPD page 0x80); same caveat as above */
    unsigned char *vpd_pg80;
    unsigned char current_tag;  /* current tag */
    struct scsi_target      *sdev_target;   /* used only for single_lun */

    unsigned int    sdev_bflags; /* black/white flags as also found in
                 * scsi_devinfo.[hc]. For now used only to
                 * pass settings from slave_alloc to scsi
                 * core. */
    unsigned int eh_timeout; /* Error handling timeout */
    unsigned removable:1;
    unsigned changed:1; /* Data invalid due to media change */
    unsigned busy:1;    /* Used to prevent races */
    unsigned lockable:1;    /* Able to prevent media removal */
    unsigned locked:1;      /* Media removal disabled */
    unsigned borken:1;  /* Tell the Seagate driver to be
                 * painfully slow on this device */
    unsigned disconnect:1;  /* can disconnect */
    unsigned soft_reset:1;  /* Uses soft reset option */
    unsigned sdtr:1;    /* Device supports SDTR messages (synchronous data transfer) */
    unsigned wdtr:1;    /* Device supports WDTR messages (16-bit wide data transfer) */
    unsigned ppr:1;     /* Device supports PPR (parallel protocol request) messages */
    unsigned tagged_supported:1;    /* Supports SCSI-II tagged queuing */
    unsigned simple_tags:1; /* simple queue tag messages are enabled */
    unsigned was_reset:1;   /* There was a bus reset on the bus for
                 * this device */
    unsigned expecting_cc_ua:1; /* Expecting a CHECK_CONDITION/UNIT_ATTN
                     * because we did a bus reset. */
    unsigned use_10_for_rw:1; /* first try 10-byte read / write */
    unsigned use_10_for_ms:1; /* first try 10-byte mode sense/select */
    unsigned no_report_opcodes:1;   /* no REPORT SUPPORTED OPERATION CODES */
    unsigned no_write_same:1;   /* no WRITE SAME command */
    unsigned use_16_for_rw:1; /* Use read/write(16) over read/write(10) */
    unsigned skip_ms_page_8:1;  /* do not use MODE SENSE page 0x08 */
    unsigned skip_ms_page_3f:1; /* do not use MODE SENSE page 0x3f */
    unsigned skip_vpd_pages:1;  /* do not read VPD pages */
    unsigned try_vpd_pages:1;   /* attempt to read VPD pages */
    unsigned use_192_bytes_for_3f:1; /* ask for 192 bytes from page 0x3f */
    unsigned no_start_on_add:1; /* do not issue start on add */
    unsigned allow_restart:1; /* issue START_UNIT in error handler */
    unsigned manage_start_stop:1;   /* Let HLD (sd) manage start/stop */
    unsigned start_stop_pwr_cond:1; /* Set power cond. in START_STOP_UNIT */
    unsigned no_uld_attach:1; /* disable connecting to upper level drivers */
    unsigned select_no_atn:1;
    unsigned fix_capacity:1;    /* READ_CAPACITY is too high by 1 */
    unsigned guess_capacity:1;  /* READ_CAPACITY might be too high by 1 */
    unsigned retry_hwerror:1;   /* Retry HARDWARE_ERROR */
    unsigned last_sector_bug:1; /* do not use multisector accesses on
                       SD_LAST_BUGGY_SECTORS */
    unsigned no_read_disc_info:1;   /* Avoid READ_DISC_INFO cmds */
    unsigned no_read_capacity_16:1; /* Avoid READ_CAPACITY_16 cmds */
    unsigned try_rc_10_first:1; /* Try READ_CAPACACITY_10 first */
    unsigned is_visible:1;  /* is the device visible in sysfs */
    unsigned wce_default_on:1;  /* Cache is ON by default */
    unsigned no_dif:1;  /* T10 PI (DIF) should be disabled */
    unsigned broken_fua:1;      /* Don't set FUA bit */
    unsigned lun_in_cdb:1;      /* Store LUN bits in CDB[1] */

    atomic_t disk_events_disable_depth; /* disable depth for disk events */

    DECLARE_BITMAP(supported_events, SDEV_EVT_MAXBITS); /* supported events */
    DECLARE_BITMAP(pending_events, SDEV_EVT_MAXBITS); /* pending events */
    struct list_head event_list;    /* asserted events */
    struct work_struct event_work;

    unsigned int max_device_blocked; /* what device_blocked counts down from  */
#define SCSI_DEFAULT_DEVICE_BLOCKED 3

    atomic_t iorequest_cnt;
    atomic_t iodone_cnt;
    atomic_t ioerr_cnt;

    struct device       sdev_gendev, /* embedded generic device, linked into the device list of the SCSI bus type (scsi_bus_type) */
                sdev_dev; /* embedded class device, linked into the device list of the SCSI device class (sdev_class) */

    struct execute_work ew; /* used to get process context on put */
    struct work_struct  requeue_work;

    struct scsi_device_handler *handler; /* custom device handler */
    void            *handler_data;

    enum scsi_device_state sdev_state;  /* SCSI device state */
    unsigned long       sdev_data[0];   /* used by the SCSI transport layer */
} __attribute__((aligned(sizeof(unsigned long))));

内核定义的SCSI命令结构scsi_cmnd

scsi_cmnd结构由SCSI中间层创建，传递到SCSI低层驱动。每个IO请求都会创建一个scsi_cmnd，但scsi_cmnd并不一定对应IO请求。scsi_cmnd最终转化成一个具体的SCSI命令。除了命令描述块之外，scsi_cmnd还包含更丰富的信息，包括数据缓冲区、感测数据缓冲区、完成回调函数以及所关联的块设备驱动层请求等，是SCSI中间层执行SCSI命令的上下文。

/*
 * struct scsi_cmnd - kernel representation of one SCSI command.
 *
 * Besides the command bytes (cmnd/cmd_len) it holds the data and
 * protection buffers, the sense buffer, retry accounting, the associated
 * block-layer request and the completion callback used by the low-level
 * driver.
 */
struct scsi_cmnd {
    struct scsi_device *device;  /* descriptor of the SCSI device this command belongs to */
    struct list_head list;  /* scsi_cmnd participates in queue lists; links into the SCSI device's command list */
    struct list_head eh_entry; /* entry for the host eh_cmd_q */
    struct delayed_work abort_work;
    int eh_eflags;      /* Used by error handlr */

    /*
     * A SCSI Command is assigned a nonzero serial_number before passed
     * to the driver's queue command function.  The serial_number is
     * cleared when scsi_done is entered indicating that the command
     * has been completed.  It is a bug for LLDDs to use this number
     * for purposes other than printk (and even that is only useful
     * for debugging).
     */
    unsigned long serial_number; /* unique serial number of the SCSI command */

    /*
     * This is set to jiffies as it was when the command was first
     * allocated.  It is used to time how long the command has
     * been outstanding
     */
    unsigned long jiffies_at_alloc; /* jiffies at allocation, used to measure command processing time */

    int retries;  /* number of retries performed */
    int allowed;  /* number of retries allowed */

    unsigned char prot_op;    /* protection operation (DIF and DIX) */
    unsigned char prot_type;  /* DIF protection type */
    unsigned char prot_flags;

    unsigned short cmd_len;   /* command length */
    enum dma_data_direction sc_data_direction;  /* data transfer direction of the command */

    /* These elements define the operation we are about to perform */
    unsigned char *cmnd;  /* command bytes in the format defined by the SCSI specification */


    /* These elements define the operation we ultimately want to perform */
    struct scsi_data_buffer sdb;        /* data buffer of the SCSI command */
    struct scsi_data_buffer *prot_sdb;  /* protection-information buffer of the SCSI command */

    unsigned underflow; /* Return error if less than
                   this amount is transferred */

    unsigned transfersize;  /* Transfer unit: how much we are guaranteed to
                   transfer with each SCSI transfer
                   (ie, between disconnect /
                   reconnects.   Probably == sector
                   size */

    struct request *request;    /* The command we are
                       working on; request descriptor
                       of the generic block layer */

#define SCSI_SENSE_BUFFERSIZE   96
    unsigned char *sense_buffer;    /* sense data buffer of the SCSI command */
                /* obtained by REQUEST SENSE when
                 * CHECK CONDITION is received on original
                 * command (auto-sense) */

    /* Low-level done function - can be used by low-level driver to point
     *        to completion function.  Not used by mid/upper level code. */
    void (*scsi_done) (struct scsi_cmnd *); /* called back when the low-level driver completes the command */

    /*
     * The following fields can be written to by the host specific code.
     * Everything else should be left alone.
     */
    struct scsi_pointer SCp;    /* Scratchpad used by some host adapters */

    unsigned char *host_scribble;   /* The host adapter is allowed to
                     * call scsi_malloc and get some memory
                     * and hang it here.  The host adapter
                     * is also expected to call scsi_free
                     * to release this memory.  (The memory
                     * obtained by scsi_malloc is guaranteed
                     * to be at an address < 16Mb). */

    int result;     /* Status code from lower level driver */
    int flags;      /* Command flags */

    unsigned char tag;  /* SCSI-II queued command tag */
};

驱动scsi_driver

/*
 * struct scsi_driver - SCSI driver descriptor: a struct device_driver plus
 * SCSI-specific callbacks.
 */
struct scsi_driver {
    struct device_driver    gendrv;  /* "inherits" from device_driver */

    void (*rescan)(struct device *); /* callback invoked before a rescan (per the original annotation) */
    int (*init_command)(struct scsi_cmnd *);
    void (*uninit_command)(struct scsi_cmnd *);
    int (*done)(struct scsi_cmnd *);  /* called when the low-level driver completes a SCSI command; used to work out the number of bytes completed */
    int (*eh_action)(struct scsi_cmnd *, int); /* error-handling callback */
};

设备模型

scsi_bus_type: scsi子系统总线类型

/*
 * Bus type of the SCSI subsystem.  Supplies the match and uevent callbacks
 * and, when CONFIG_PM is enabled, the power-management operations.
 */
struct bus_type scsi_bus_type = {
        .name       = "scsi",   /* corresponds to /sys/bus/scsi */
        .match      = scsi_bus_match,
    .uevent     = scsi_bus_uevent,
#ifdef CONFIG_PM
    .pm     = &scsi_bus_pm_ops,
#endif
};
EXPORT_SYMBOL_GPL(scsi_bus_type);

shost_class: scsi主机适配器类

/* Device class that SCSI host adapters belong to. */
static struct class shost_class = {
    .name       = "scsi_host",  /* corresponds to /sys/class/scsi_host */
    .dev_release    = scsi_host_cls_release,
};

这里写图片描述

初始化过程

操作系统启动时，会加载scsi子系统，入口函数是init_scsi，使用subsys_initcall定义：

/*
 * init_scsi - entry point that brings up the SCSI subsystem.
 *
 * Runs the initialization steps below in order.  When a step fails, the
 * steps that already succeeded are undone in reverse order through the
 * cleanup labels, an error is logged and the failing step's error code is
 * returned.  Returns 0 on success.
 */
static int __init init_scsi(void)
{
    int error;

    error = scsi_init_queue();  /* set up the memory pools needed for scatter-gather lists */
    if (error)
        return error;
    error = scsi_init_procfs(); /* set up the SCSI-related entries in procfs */
    if (error)
        goto cleanup_queue;
    error = scsi_init_devinfo();/* set up the dynamic SCSI device info list */
    if (error)
        goto cleanup_procfs;
    error = scsi_init_hosts();  /* register the shost_class class, which creates the scsi_host directory under /sys/class/ */
    if (error)
        goto cleanup_devlist;
    error = scsi_init_sysctl(); /* register the SCSI sysctl table */
    if (error)
        goto cleanup_hosts;
    error = scsi_sysfs_register(); /* register the scsi_bus_type bus type and the sdev_class class */
    if (error)
        goto cleanup_sysctl;

    scsi_netlink_init(); /* set up the SCSI transport netlink interface; no result is checked here */

    printk(KERN_NOTICE "SCSI subsystem initialized\n");
    return 0;

    /* Unwind: each label undoes one step and falls through to the earlier ones. */
cleanup_sysctl:
    scsi_exit_sysctl();
cleanup_hosts:
    scsi_exit_hosts();
cleanup_devlist:
    scsi_exit_devinfo();
cleanup_procfs:
    scsi_exit_procfs();
cleanup_queue:
    scsi_exit_queue();
    /* error is negated for printing, so the log shows a positive number */
    printk(KERN_ERR "SCSI subsystem failed to initialize, error = %d\n",
           -error);
    return error;
}

scsi_init_hosts函数初始化scsi子系统主机适配器所属的类shost_class：

/*
 * scsi_init_hosts - register shost_class, the class of SCSI host adapters.
 *
 * Returns whatever class_register() returns.
 */
int scsi_init_hosts(void)
{
    return class_register(&shost_class);
}

scsi_sysfs_register函数注册scsi子系统总线类型scsi_bus_type和设备所属的类sdev_class：

/*
 * scsi_sysfs_register - register the SCSI bus type and the SCSI device class.
 *
 * Registers scsi_bus_type first and sdev_class second.  If the class cannot
 * be registered, the bus registration is rolled back.
 *
 * Returns 0 on success, otherwise the error of the step that failed.
 */
int scsi_sysfs_register(void)
{
    int rc;

    rc = bus_register(&scsi_bus_type);
    if (rc)
        return rc;

    rc = class_register(&sdev_class);
    if (rc)
        bus_unregister(&scsi_bus_type); /* undo the bus registration */

    return rc;
}

scsi低层驱动是面向主机适配器的，低层驱动被加载时，需要添加主机适配器。主机适配器添加有两种方式：1.在PCI子系统扫描挂载驱动时添加；2.手动方式添加。所有基于硬件PCI接口的主机适配器都采用第一种方式。添加主机适配器包括两个步骤：
1. 分配主机适配器数据结构scsi_host_alloc
2. 将主机适配器添加到系统scsi_add_host

/*
 * scsi_host_alloc - allocate and initialize a Scsi_Host.
 * @sht: host template; its settings are copied into the new host
 * @privsize: extra bytes allocated together with the Scsi_Host for private data
 *
 * Fills in defaults, copies the template values, initializes the two
 * embedded devices, starts the error-handler thread and allocates the
 * task-management workqueue.
 *
 * Returns the new host, or NULL when the memory allocation, the
 * error-handler thread or the workqueue could not be obtained.
 */
struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
{
    struct Scsi_Host *shost;
    gfp_t gfp_mask = GFP_KERNEL;

    if (sht->unchecked_isa_dma && privsize)
        gfp_mask |= __GFP_DMA;
    /* allocate the Scsi_Host and the private data area in one go */
    shost = kzalloc(sizeof(struct Scsi_Host) + privsize, gfp_mask);
    if (!shost)
        return NULL;

    shost->host_lock = &shost->default_lock;
    spin_lock_init(shost->host_lock);
    shost->shost_state = SHOST_CREATED; /* set the initial state */
    INIT_LIST_HEAD(&shost->__devices);  /* SCSI device list */
    INIT_LIST_HEAD(&shost->__targets);  /* target list */
    INIT_LIST_HEAD(&shost->eh_cmd_q);   /* list of SCSI commands that hit an error */
    INIT_LIST_HEAD(&shost->starved_list);   /* "starved" list */
    init_waitqueue_head(&shost->host_wait);
    mutex_init(&shost->scan_mutex);

    /*
     * subtract one because we increment first then return, but we need to
     * know what the next host number was before increment
     *
     * Host numbers are handed out by incrementing a global counter.
     */
    shost->host_no = atomic_inc_return(&scsi_host_next_hn) - 1;
    shost->dma_channel = 0xff;

    /* These three are default values which can be overridden */
    shost->max_channel = 0; /* default highest channel number is 0 */
    shost->max_id = 8;      /* default upper bound for target ids */
    shost->max_lun = 8;     /* default upper bound for LUNs */

    /* Give each shost a default transportt */
    shost->transportt = &blank_transport_template;  /* template of the SCSI transport layer (mid-layer) */

    /*
     * All drivers right now should be able to handle 12 byte
     * commands.  Every so often there are requests for 16 byte
     * commands, but individual low-level drivers need to certify that
     * they actually do something sensible with such commands.
     */
    shost->max_cmd_len = 12;  /* longest SCSI command length */
    shost->hostt = sht;       /* remember the host template in use */
    shost->this_id = sht->this_id;
    shost->can_queue = sht->can_queue;
    shost->sg_tablesize = sht->sg_tablesize;
    shost->sg_prot_tablesize = sht->sg_prot_tablesize;
    shost->cmd_per_lun = sht->cmd_per_lun;
    shost->unchecked_isa_dma = sht->unchecked_isa_dma;
    shost->use_clustering = sht->use_clustering;
    shost->no_write_same = sht->no_write_same;

    /*
     * eh_deadline stays -1 when it is unset or the template has no host
     * reset handler; otherwise it is converted with HZ and clamped to
     * INT_MAX.
     */
    if (shost_eh_deadline == -1 || !sht->eh_host_reset_handler)
        shost->eh_deadline = -1;
    else if ((ulong) shost_eh_deadline * HZ > INT_MAX) {
        shost_printk(KERN_WARNING, shost,
                 "eh_deadline %u too large, setting to %u\n",
                 shost_eh_deadline, INT_MAX / HZ);
        shost->eh_deadline = INT_MAX;
    } else
        shost->eh_deadline = shost_eh_deadline * HZ;

    if (sht->supported_mode == MODE_UNKNOWN) /* the template decides the HBA mode */
        /* means we didn't set it ... default to INITIATOR */
        shost->active_mode = MODE_INITIATOR;  /* host adapter mode defaults to initiator */
    else
        shost->active_mode = sht->supported_mode;

    if (sht->max_host_blocked)
        shost->max_host_blocked = sht->max_host_blocked;
    else
        shost->max_host_blocked = SCSI_DEFAULT_HOST_BLOCKED;

    /*
     * If the driver imposes no hard sector transfer limit, start at
     * machine infinity initially.
     */
    if (sht->max_sectors)
        shost->max_sectors = sht->max_sectors;
    else
        shost->max_sectors = SCSI_DEFAULT_MAX_SECTORS;

    /*
     * assume a 4GB boundary, if not set
     */
    if (sht->dma_boundary)
        shost->dma_boundary = sht->dma_boundary;
    else
        shost->dma_boundary = 0xffffffff;  /* default DMA boundary is 4G */

    shost->use_blk_mq = scsi_use_blk_mq && !shost->hostt->disable_blk_mq;

    device_initialize(&shost->shost_gendev); /* initialize the embedded generic device */
    dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
    shost->shost_gendev.bus = &scsi_bus_type;   /* bus type of the host adapter */
    shost->shost_gendev.type = &scsi_host_type; /* device type of the host adapter */

    device_initialize(&shost->shost_dev);    /* initialize the embedded class device */
    shost->shost_dev.parent = &shost->shost_gendev; /* the class device's parent is the embedded generic device */
    shost->shost_dev.class = &shost_class;   /* the class device belongs to shost_class */
    dev_set_name(&shost->shost_dev, "host%d", shost->host_no);
    shost->shost_dev.groups = scsi_sysfs_shost_attr_groups;  /* attribute groups of the class device */

    shost->ehandler = kthread_run(scsi_error_handler, shost,  /* start the host adapter's error-recovery kernel thread */
            "scsi_eh_%d", shost->host_no);
    if (IS_ERR(shost->ehandler)) {
        shost_printk(KERN_WARNING, shost,
            "error handler thread failed to spawn, error = %ld\n",
            PTR_ERR(shost->ehandler));
        goto fail_kfree;
    }
    /* allocate the task-management workqueue */
    shost->tmf_work_q = alloc_workqueue("scsi_tmf_%d",
                        WQ_UNBOUND | WQ_MEM_RECLAIM,
                       1, shost->host_no);
    if (!shost->tmf_work_q) {
        shost_printk(KERN_WARNING, shost,
                 "failed to create tmf workq\n");
        goto fail_kthread;
    }
    scsi_proc_hostdir_add(shost->hostt); /* add the host adapter's directory in procfs, i.e. /proc/scsi/<host adapter name> */
    return shost;

 fail_kthread:
    kthread_stop(shost->ehandler);
 fail_kfree:
    /*
     * NOTE(review): device_initialize() and dev_set_name() have already
     * run on both embedded devices at this point, yet the host is released
     * with a plain kfree().  This may leak the device names; the driver
     * core asks for put_device() after device_initialize() - confirm
     * against a newer kernel before relying on this path.
     */
    kfree(shost);
    return NULL;
}
EXPORT_SYMBOL(scsi_host_alloc);

/*
 * scsi_add_host - add a host adapter, using the parent device for DMA too.
 * @host: host to add
 * @dev: parent device; also passed on as the DMA device
 *
 * Returns the result of scsi_add_host_with_dma().
 */
static inline int __must_check scsi_add_host(struct Scsi_Host *host,
                         struct device *dev) /* dev is the parent device */
{
    return scsi_add_host_with_dma(host, dev, dev);
}

/*
 * scsi_add_host_with_dma - add an allocated host adapter to the system.
 * @shost: host to add
 * @dev: parent device; &platform_bus is used when this is NULL and
 *       shost_gendev has no parent yet
 * @dma_dev: device used for DMA; the parent of shost_gendev is used when NULL
 *
 * Sets up tags and the command freelist, adds the generic and the class
 * device, allocates optional transport data and workqueue, and finally
 * adds the sysfs and procfs entries.
 *
 * Returns 0 on success, otherwise an error code; on failure the steps
 * already completed are undone through the labels at the end.
 */
int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
               struct device *dma_dev)
{
    struct scsi_host_template *sht = shost->hostt;
    int error = -EINVAL;

    shost_printk(KERN_INFO, shost, "%s\n",
            sht->info ? sht->info(shost) : sht->name);

    if (!shost->can_queue) {
        shost_printk(KERN_ERR, shost,
                 "can_queue = 0 no longer supported\n");
        goto fail;
    }

    if (shost_use_blk_mq(shost)) {         /* host is set to use multi-queue I/O: */
        error = scsi_mq_setup_tags(shost); /* set up the multi-queue tags */
        if (error)
            goto fail;
    } else {
        shost->bqt = blk_init_tags(shost->can_queue,
                shost->hostt->tag_alloc_policy);
        if (!shost->bqt) {
            error = -ENOMEM;
            goto fail;
        }
    }

    /*
     * Note that we allocate the freelist even for the MQ case for now,
     * as we need a command set aside for scsi_reset_provider.  Having
     * the full host freelist and one command available for that is a
     * little heavy-handed, but avoids introducing a special allocator
     * just for this.  Eventually the structure of scsi_reset_provider
     * will need a major overhaul.
     *
     * Per the original annotation this allocates the buffers for SCSI
     * commands and sense data plus the backup command list.
     */
    error = scsi_setup_command_freelist(shost);
    if (error)
        goto out_destroy_tags;

    /*
     * Set the parent of the host adapter, which fixes its position in
     * sysfs; callers usually pass a pci_dev's device through dev.
     */
    if (!shost->shost_gendev.parent)
        shost->shost_gendev.parent = dev ? dev : &platform_bus; /* fall back to platform_bus when dev is NULL */
    if (!dma_dev)
        dma_dev = shost->shost_gendev.parent;

    shost->dma_dev = dma_dev;

    error = device_add(&shost->shost_gendev);  /* add the host adapter's generic device to the system */
    if (error)
        goto out_destroy_freelist;

    /*
     * NOTE(review): none of the unwind labels below disables runtime PM
     * again - confirm whether that is intended.
     */
    pm_runtime_set_active(&shost->shost_gendev);
    pm_runtime_enable(&shost->shost_gendev);
    device_enable_async_suspend(&shost->shost_gendev); /* allow asynchronous suspend of the generic device */

    scsi_host_set_state(shost, SHOST_RUNNING);  /* update the host adapter state */
    /*
     * Take a reference on the parent of the generic device.
     * NOTE(review): the unwind labels below never drop this reference.
     */
    get_device(shost->shost_gendev.parent);

    device_enable_async_suspend(&shost->shost_dev);  /* allow asynchronous suspend of the class device */

    error = device_add(&shost->shost_dev);    /* add the host adapter's class device to the system */
    if (error)
        goto out_del_gendev;

    /* NOTE(review): this reference is not dropped by the unwind labels either. */
    get_device(&shost->shost_gendev);

    if (shost->transportt->host_size) {  /* data area used by the SCSI transport layer */
        shost->shost_data = kzalloc(shost->transportt->host_size,
                     GFP_KERNEL);
        if (shost->shost_data == NULL) {
            error = -ENOMEM;
            goto out_del_dev;
        }
    }

    if (shost->transportt->create_work_queue) {
        snprintf(shost->work_q_name, sizeof(shost->work_q_name),
             "scsi_wq_%d", shost->host_no);
        shost->work_q = create_singlethread_workqueue( /* workqueue used by the SCSI transport layer */
                    shost->work_q_name);
        if (!shost->work_q) {
            error = -EINVAL;
            goto out_free_shost_data;
        }
    }

    error = scsi_sysfs_add_host(shost); /* add the host adapter's sysfs entries (per the original annotation) */
    if (error)
        goto out_destroy_host;

    scsi_proc_host_add(shost);  /* add the host adapter's information to procfs */
    return error;

    /* Unwind: each label undoes one step and falls through to the earlier ones. */
 out_destroy_host:
    if (shost->work_q)
        destroy_workqueue(shost->work_q);
 out_free_shost_data:
    kfree(shost->shost_data);
 out_del_dev:
    device_del(&shost->shost_dev);
 out_del_gendev:
    device_del(&shost->shost_gendev);
 out_destroy_freelist:
    scsi_destroy_command_freelist(shost);
 out_destroy_tags:
    /* only the multi-queue tags are torn down here; shost->bqt is left alone */
    if (shost_use_blk_mq(shost))
        scsi_mq_destroy_tags(shost);
 fail:
    return error;
}
EXPORT_SYMBOL(scsi_add_host_with_dma);

设备探测过程

在系统启动过程中，会扫描默认的PCI根总线，从而触发PCI设备扫描的过程，开始构造PCI设备树。SCSI主机适配器是挂载在PCI总线上的设备，作为PCI设备会被PCI总线驱动层扫描到(PCI设备的扫描采用配置空间访问的方式)。扫描到SCSI主机适配器后，操作系统开始加载SCSI主机适配器驱动，也就是上面所说的低层驱动。SCSI主机适配器驱动根据SCSI主机适配器模板分配SCSI主机适配器描述符，并添加到系统，之后启动通过SCSI主机适配器扩展出来的下一级总线——SCSI总线的扫描过程。

SCSI中间层依次以可能的ID和LUN构造INQUIRY命令，之后将这些INQUIRY命令提交给块IO子系统，后者又最终将调用SCSI中间层的策略例程，再次提取到SCSI命令结构后，调用SCSI低层驱动的queuecommand回调函数实现。
对于给定ID的目标节点，如果它在SCSI总线上存在，那么它一定要实现对LUN0的INQUIRY响应。也就是说，如果向某个ID的目标节点的LUN0发送INQUIRY命令，或依次向各个LUN尝试发送INQUIRY命令，检查是否能收到响应，最终SCSI中间层能够得到SCSI域中的所连接的逻辑设备及其信息。

SCSI总线具体的扫描方式可以由具体的主机适配器固件、主机适配器驱动实现，在此只讨论由主机适配器驱动调用scsi中间层提供通用的扫描函数的实现方式scsi_scan_host。

/*
 * scsi_scan_host - scan a host adapter, asynchronously when possible.
 * @shost: host adapter to scan
 *
 * Does nothing when scsi_scan_type starts with "none" or when
 * scsi_autopm_get_host() fails.  If scsi_prep_async_scan() hands back scan
 * data, the scan is scheduled through the async subsystem; otherwise it
 * runs synchronously right here.
 */
void scsi_scan_host(struct Scsi_Host *shost)
{
    struct async_scan_data *scan_data;

    /* Scanning disabled, or the host could not be resumed: nothing to do. */
    if (!strncmp(scsi_scan_type, "none", 4) ||
        scsi_autopm_get_host(shost) < 0)
        return;

    scan_data = scsi_prep_async_scan(shost);
    if (scan_data) {
        /*
         * Registering with the async subsystem lets
         * wait_for_device_probe() flush this work.  The matching
         * scsi_autopm_put_host(shost) is called in
         * scsi_finish_async_scan().
         */
        async_schedule(do_scan_async, scan_data);
        return;
    }

    /* No async scan data: scan synchronously and drop the PM reference. */
    do_scsi_scan_host(shost);
    scsi_autopm_put_host(shost);
}
EXPORT_SYMBOL(scsi_scan_host);

scsi_scan_host函数是scsi中间层提供的主机适配器扫描函数。有自定义扫描逻辑需求的主机适配器驱动可以设置主机适配器模板中的回调函数(scan_start、scan_finished)，scsi_scan_host函数最终会调用这些回调来实现自定义扫描。
scsi_scan_type变量指定了扫描方式：async、sync、none。无论最终扫描方式是同步还是异步，都是由do_scsi_scan_host函数实现：

/*
 * do_scsi_scan_host - perform the actual scan of a host adapter.
 * @shost: host adapter to scan
 *
 * When the host template provides scan_finished, the template drives the
 * scan: scan_start (if set) is called once, then scan_finished is polled
 * every 10 ms with the elapsed jiffies until it returns nonzero.
 * Otherwise the generic scan over all channels, ids and LUNs is used.
 */
static void do_scsi_scan_host(struct Scsi_Host *shost)
{
    unsigned long began;

    if (!shost->hostt->scan_finished) {
        /* Generic scan; SCAN_WILD_CARD selects every channel, id and LUN. */
        scsi_scan_host_selected(shost, SCAN_WILD_CARD, SCAN_WILD_CARD,
                SCAN_WILD_CARD, 0);
        return;
    }

    /* Template-driven scan. */
    began = jiffies;
    if (shost->hostt->scan_start)
        shost->hostt->scan_start(shost);

    for (;;) {
        if (shost->hostt->scan_finished(shost, jiffies - began))
            break;
        msleep(10);
    }
}

如果主机适配器模板设置了自定义扫描函数，do_scsi_scan_host函数将会调用。如果没有设置则使用默认的扫描函数scsi_scan_host_selected执行扫描。

/*
 * scsi_scan_host_selected - scan the selected channel/id/lun of a host.
 * @shost: host adapter to scan
 * @channel: channel to scan, or SCAN_WILD_CARD for every channel
 * @id: target id to scan, or SCAN_WILD_CARD
 * @lun: LUN to scan, or SCAN_WILD_CARD
 * @rescan: passed through to scsi_scan_channel()
 *
 * Returns -EINVAL when an explicitly given channel, id or lun is out of
 * range for the host.  Otherwise returns 0, including when the host state
 * did not allow a scan or the host could not be resumed.
 */
int scsi_scan_host_selected(struct Scsi_Host *shost, unsigned int channel,
                unsigned int id, u64 lun, int rescan)
{
    SCSI_LOG_SCAN_BUS(3, shost_printk (KERN_INFO, shost,
        "%s: <%u:%u:%llu>\n",
        __func__, channel, id, lun));

    /* Validate every coordinate that is not a wildcard. */
    if (channel != SCAN_WILD_CARD && channel > shost->max_channel)
        return -EINVAL;
    if (id != SCAN_WILD_CARD && id >= shost->max_id)
        return -EINVAL;
    if (lun != SCAN_WILD_CARD && lun >= shost->max_lun)
        return -EINVAL;

    mutex_lock(&shost->scan_mutex);
    if (!shost->async_scan)
        scsi_complete_async_scans();

    /* Scan only if the host state permits it and the host can be resumed. */
    if (!scsi_host_scan_allowed(shost) || scsi_autopm_get_host(shost) != 0)
        goto unlock;

    if (channel != SCAN_WILD_CARD) {
        /* one specific channel */
        scsi_scan_channel(shost, channel, id, lun, rescan);
    } else {
        /* every channel from 0 up to and including max_channel */
        channel = 0;
        while (channel <= shost->max_channel) {
            scsi_scan_channel(shost, channel, id, lun, rescan);
            channel++;
        }
    }
    scsi_autopm_put_host(shost);

 unlock:
    mutex_unlock(&shost->scan_mutex);

    return 0;
}

scsi_scan_host_selected函数扫描指定的主机适配器，根据输入的参数决定是否遍历扫描所有channel或扫描指定channel，通过函数scsi_scan_channel完成。

/*
 * scsi_scan_channel - scan one channel of a host adapter.
 * @shost: host adapter being scanned
 * @channel: channel to scan
 * @id: target id to scan, or SCAN_WILD_CARD for every id below max_id
 * @lun: LUN selector handed on to __scsi_scan_target()
 * @rescan: handed on to __scsi_scan_target()
 */
static void scsi_scan_channel(struct Scsi_Host *shost, unsigned int channel,
                  unsigned int id, u64 lun, int rescan)
{
    uint order_id;

    if (id != SCAN_WILD_CARD) {
        /* a single, explicitly named target */
        __scsi_scan_target(&shost->shost_gendev, channel,
                id, lun, rescan);
        return;
    }

    /*
     * Walk every target id.
     *
     * XXX adapter drivers when possible (FCP, iSCSI)
     * could modify max_id to match the current max,
     * not the absolute max.
     *
     * XXX add a shost id iterator, so for example,
     * the FC ID can be the same as a target id
     * without a huge overhead of sparse id's.
     */
    for (id = 0; id < shost->max_id; ++id) {
        /* With reverse_ordering set, scan from the highest id down. */
        order_id = shost->reverse_ordering ? shost->max_id - id - 1 : id;
        __scsi_scan_target(&shost->shost_gendev, channel,
                order_id, lun, rescan);
    }
}

__scsi_scan_target函数扫描指定target内部的lun。

/*
 * __scsi_scan_target - scan the LUNs of one target.
 * @parent: parent device; the host adapter is derived from it with dev_to_shost()
 * @channel: channel the target is on
 * @id: target id; the host adapter's own id (this_id) is never scanned
 * @lun: a specific LUN, or SCAN_WILD_CARD to discover the LUNs
 * @rescan: handed on to the probe and scan helpers
 *
 * For a specific LUN only that LUN is probed.  Otherwise LUN 0 is probed
 * first; if it responds, a REPORT LUNS based scan is tried, and a
 * sequential LUN scan is used when that returns nonzero.
 */
static void __scsi_scan_target(struct device *parent, unsigned int channel,
        unsigned int id, u64 lun, int rescan)
{
    struct Scsi_Host *shost = dev_to_shost(parent);
    int bflags = 0;
    int res;
    struct scsi_target *starget;

    if (shost->this_id == id)
        /*
         * Don't scan the host adapter
         */
        return;
    /* allocate and initialize the target structure for the given id */
    starget = scsi_alloc_target(parent, channel, id);
    if (!starget)
        return;
    scsi_autopm_get_target(starget);

    if (lun != SCAN_WILD_CARD) {
        /*
         * Scan for a specific host/chan/id/lun.
         *
         * Probes the given LUN on the target and adds the resulting
         * scsi_device to the subsystem.
         */
        scsi_probe_and_add_lun(starget, lun, NULL, NULL, rescan, NULL);
        goto out_reap;
    }

    /*
     * Scan LUN 0, if there is some response, scan further. Ideally, we
     * would not configure LUN 0 until all LUNs are scanned.
     */
    res = scsi_probe_and_add_lun(starget, 0, &bflags, NULL, rescan, NULL);
    if (res == SCSI_SCAN_LUN_PRESENT || res == SCSI_SCAN_TARGET_PRESENT) {
        if (scsi_report_lun_scan(starget, bflags, rescan) != 0) /* try a REPORT LUNS based scan first */
            /*
             * The REPORT LUN did not scan the target,
             * do a sequential scan.
             */
            scsi_sequential_lun_scan(starget, bflags,  /* fallback: probe the LUNs one after another */
                         starget->scsi_level, rescan);
    }

 out_reap:
    scsi_autopm_put_target(starget);
    /*
     * paired with scsi_alloc_target(): determine if the target has
     * any children at all and if not, nuke it
     */
    scsi_target_reap(starget);

    put_device(&starget->dev);
}

扫描到target时分配并初始化scsi_target结构，scsi_probe_and_add_lun函数完成探测target中的lun，并将发现的lun添加到系统。

/**
 * scsi_probe_and_add_lun - probe a LUN and, if one is found, add it
 * @starget:  target the LUN belongs to
 * @lun:      LUN to probe
 * @bflagsp:  if non-NULL, receives the device flags found for the LUN
 * @sdevp:    if non-NULL, receives the scsi_device when the result is
 *            SCSI_SCAN_LUN_PRESENT
 * @rescan:   0 on the first scan of a host adapter; non-zero otherwise
 * @hostdata: handed to scsi_alloc_sdev() when a new device is allocated
 *
 * Looks the LUN up on the target (allocating a scsi_device if there is
 * none), sends an INQUIRY through scsi_probe_lun() and, unless the
 * INQUIRY data says no logical unit is there, configures the device
 * through scsi_add_lun().
 *
 * Return:
 *   SCSI_SCAN_NO_RESPONSE    - allocation failed, the probe failed, or
 *                              scsi_add_lun() reported no response
 *   SCSI_SCAN_TARGET_PRESENT - the target answered but no LUN was added
 *   SCSI_SCAN_LUN_PRESENT    - the LUN exists (already known or just added)
 */
static int scsi_probe_and_add_lun(struct scsi_target *starget,
                  u64 lun, int *bflagsp,
                  struct scsi_device **sdevp, int rescan,
                  void *hostdata)
{
    struct scsi_device *sdev;
    unsigned char *result;
    int bflags, res = SCSI_SCAN_NO_RESPONSE, result_len = 256;
    struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);

    /*
     * The rescan flag is used as an optimization, the first scan of a
     * host adapter calls into here with rescan == 0.
     */
    sdev = scsi_device_lookup_by_target(starget, lun);
    if (sdev) {
        /*
         * The LUN is already known on this target. When rescanning,
         * or when scsi_device_created() is false for it, report the
         * existing device without sending another INQUIRY.
         */
        if (rescan || !scsi_device_created(sdev)) {
            SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
                "scsi scan: device exists on %s\n",
                dev_name(&sdev->sdev_gendev)));
            /* Hand the looked-up device to the caller, or drop it. */
            if (sdevp)
                *sdevp = sdev;
            else
                scsi_device_put(sdev);

            if (bflagsp)
                *bflagsp = scsi_get_device_flags(sdev,
                                 sdev->vendor,
                                 sdev->model);
            return SCSI_SCAN_LUN_PRESENT;
        }
        scsi_device_put(sdev);
    } else
        /* No such LUN on the target yet: allocate a scsi_device. */
        sdev = scsi_alloc_sdev(starget, lun, hostdata);
    if (!sdev)
        goto out;

    /* Buffer for the INQUIRY data; DMA-capable if the host needs it. */
    result = kmalloc(result_len, GFP_ATOMIC |
            ((shost->unchecked_isa_dma) ? __GFP_DMA : 0));
    if (!result)
        goto out_free_sdev;

    /* Send INQUIRY to the device to find out what is there. */
    if (scsi_probe_lun(sdev, result, result_len, &bflags))
        goto out_free_result;

    if (bflagsp)
        *bflagsp = bflags;
    /*
     * result contains valid SCSI INQUIRY data.
     */
    if (((result[0] >> 5) == 3) && !(bflags & BLIST_ATTACH_PQ3)) {
        /*
         * For a Peripheral qualifier 3 (011b), the SCSI
         * spec says: The device server is not capable of
         * supporting a physical device on this logical
         * unit.
         *
         * For disks, this implies that there is no
         * logical disk configured at sdev->lun, but there
         * is a target id responding.
         */
        SCSI_LOG_SCAN_BUS(2, sdev_printk(KERN_INFO, sdev, "scsi scan:"
                   " peripheral qualifier of 3, device not"
                   " added\n"));
        if (lun == 0) {
            SCSI_LOG_SCAN_BUS(1, {
                unsigned char vend[9];
                unsigned char mod[17];

                sdev_printk(KERN_INFO, sdev,
                    "scsi scan: consider passing scsi_mod."
                    "dev_flags=%s:%s:0x240 or 0x1000240\n",
                    scsi_inq_str(vend, result, 8, 16),
                    scsi_inq_str(mod, result, 16, 32));
            });

        }

        res = SCSI_SCAN_TARGET_PRESENT;
        goto out_free_result;
    }

    /*
     * Some targets may set slight variations of PQ and PDT to signal
     * that no LUN is present, so don't add sdev in these cases.
     * Two specific examples are:
     * 1) NetApp targets: return PQ=1, PDT=0x1f
     * 2) USB UFI: returns PDT=0x1f, with the PQ bits being "reserved"
     *    in the UFI 1.0 spec (we cannot rely on reserved bits).
     *
     * References:
     * 1) SCSI SPC-3, pp. 145-146
     * PQ=1: "A peripheral device having the specified peripheral
     * device type is not connected to this logical unit. However, the
     * device server is capable of supporting the specified peripheral
     * device type on this logical unit."
     * PDT=0x1f: "Unknown or no device type"
     * 2) USB UFI 1.0, p. 20
     * PDT=00h Direct-access device (floppy)
     * PDT=1Fh none (no FDD connected to the requested logical unit)
     */
    if (((result[0] >> 5) == 1 || starget->pdt_1f_for_no_lun) &&
        (result[0] & 0x1f) == 0x1f &&
        !scsi_is_wlun(lun)) {
        SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
                    "scsi scan: peripheral device type"
                    " of 31, no device added\n"));
        res = SCSI_SCAN_TARGET_PRESENT;
        goto out_free_result;
    }

    /* Configure the device from the INQUIRY data and add it. */
    res = scsi_add_lun(sdev, result, &bflags, shost->async_scan);
    if (res == SCSI_SCAN_LUN_PRESENT) {
        if (bflags & BLIST_KEY) {
            sdev->lockable = 0;
            scsi_unlock_floptical(sdev, result);
        }
    }

 out_free_result:
    kfree(result);
 out_free_sdev:
    if (res == SCSI_SCAN_LUN_PRESENT) {
        if (sdevp) {
            /*
             * The caller wants the device back: take a reference
             * for it, and give up on the LUN if that fails.
             */
            if (scsi_device_get(sdev) == 0) {
                *sdevp = sdev;
            } else {
                __scsi_remove_device(sdev);
                res = SCSI_SCAN_NO_RESPONSE;
            }
        }
    } else
        /* Nothing was added: remove the scsi_device again. */
        __scsi_remove_device(sdev);
 out:
    return res;
}

scsi_probe_and_add_lun函数由名字可知，完成lun的probe和add两个操作：
1. 探测逻辑设备scsi_probe_lun，发送INQUIRY命令到具体设备。
2. 添加逻辑设备到系统scsi_add_lun，根据INQUIRY命令返回值添加lun到系统。

/*
 * scsi_probe_lun - send INQUIRY to a device and record what it reports
 * @sdev:       device to probe
 * @inq_result: buffer that receives the INQUIRY data
 * @result_len: size of @inq_result as allocated by the caller; it is not
 *              referenced in this function body
 * @bflags:     zeroed on entry; on a successful INQUIRY it receives the
 *              value returned by scsi_get_device_flags()
 *
 * Up to three INQUIRY passes are made (see the comments below). On
 * success sdev->inquiry_len, sdev->scsi_level, the target's scsi_level
 * and sdev->lun_in_cdb are filled in.
 *
 * Returns 0 on success, or -EIO if the last INQUIRY attempt failed.
 */
static int scsi_probe_lun(struct scsi_device *sdev, unsigned char *inq_result,
              int result_len, int *bflags)
{
    unsigned char scsi_cmd[MAX_COMMAND_SIZE];
    int first_inquiry_len, try_inquiry_len, next_inquiry_len;
    int response_len = 0;
    int pass, count, result;
    struct scsi_sense_hdr sshdr;

    *bflags = 0;

    /* Perform up to 3 passes.  The first pass uses a conservative
     * transfer length of 36 unless sdev->inquiry_len specifies a
     * different value. */
    first_inquiry_len = sdev->inquiry_len ? sdev->inquiry_len : 36;
    try_inquiry_len = first_inquiry_len;
    pass = 1;

 next_pass:
    SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
                "scsi scan: INQUIRY pass %d length %d\n",
                pass, try_inquiry_len));

    /* Each pass gets up to three chances to ignore Unit Attention */
    for (count = 0; count < 3; ++count) {
        int resid;

        memset(scsi_cmd, 0, 6);
        scsi_cmd[0] = INQUIRY;      /* the command opcode is INQUIRY */
        scsi_cmd[4] = (unsigned char) try_inquiry_len;

        memset(inq_result, 0, try_inquiry_len);
        /* send the SCSI command, with 3 retries */
        result = scsi_execute_req(sdev,  scsi_cmd, DMA_FROM_DEVICE,
                      inq_result, try_inquiry_len, &sshdr,
                      HZ / 2 + HZ * scsi_inq_timeout, 3,
                      &resid);

        SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
                "scsi scan: INQUIRY %s with code 0x%x\n",
                result ? "failed" : "successful", result));

        if (result) {
            /*
             * not-ready to ready transition [asc/ascq=0x28/0x0]
             * or power-on, reset [asc/ascq=0x29/0x0], continue.
             * INQUIRY should not yield UNIT_ATTENTION
             * but many buggy devices do so anyway.
             */
            if ((driver_byte(result) & DRIVER_SENSE) &&
                scsi_sense_valid(&sshdr)) {
                if ((sshdr.sense_key == UNIT_ATTENTION) &&
                    ((sshdr.asc == 0x28) ||
                     (sshdr.asc == 0x29)) &&
                    (sshdr.ascq == 0))
                    continue;
            }
        } else {
            /*
             * if nothing was transferred, we try
             * again. It's a workaround for some USB
             * devices.
             */
            if (resid == try_inquiry_len)
                continue;
        }
        break;
    }

    if (result == 0) {
        /* Clean up the vendor, product and revision strings in place. */
        sanitize_inquiry_string(&inq_result[8], 8);
        sanitize_inquiry_string(&inq_result[16], 16);
        sanitize_inquiry_string(&inq_result[32], 4);

        /* Byte 4 is the additional length; add the 5 bytes before it. */
        response_len = inq_result[4] + 5;
        if (response_len > 255)
            response_len = first_inquiry_len;   /* sanity */

        /*
         * Get any flags for this device.
         *
         * XXX add a bflags to scsi_device, and replace the
         * corresponding bit fields in scsi_device, so bflags
         * need not be passed as an argument.
         */
        *bflags = scsi_get_device_flags(sdev, &inq_result[8],
                &inq_result[16]);

        /* When the first pass succeeds we gain information about
         * what larger transfer lengths might work. */
        if (pass == 1) {
            if (BLIST_INQUIRY_36 & *bflags)
                next_inquiry_len = 36;
            else if (BLIST_INQUIRY_58 & *bflags)
                next_inquiry_len = 58;
            else if (sdev->inquiry_len)
                next_inquiry_len = sdev->inquiry_len;
            else
                next_inquiry_len = response_len;

            /* If more data is available perform the second pass */
            if (next_inquiry_len > try_inquiry_len) {
                try_inquiry_len = next_inquiry_len;
                pass = 2;
                goto next_pass;
            }
        }

    } else if (pass == 2) {
        sdev_printk(KERN_INFO, sdev,
                "scsi scan: %d byte inquiry failed.  "
                "Consider BLIST_INQUIRY_36 for this device\n",
                try_inquiry_len);

        /* If this pass failed, the third pass goes back and transfers
         * the same amount as we successfully got in the first pass. */
        try_inquiry_len = first_inquiry_len;
        pass = 3;
        goto next_pass;
    }

    /* If the last transfer attempt got an error, assume the
     * peripheral doesn't exist or is dead. */
    if (result)
        return -EIO;

    /* Don't report any more data than the device says is valid */
    sdev->inquiry_len = min(try_inquiry_len, response_len);

    /*
     * XXX Abort if the response length is less than 36? If less than
     * 32, the lookup of the device flags (above) could be invalid,
     * and it would be possible to take an incorrect action - we do
     * not want to hang because of a short INQUIRY. On the flip side,
     * if the device is spun down or becoming ready (and so it gives a
     * short INQUIRY), an abort here prevents any further use of the
     * device, including spin up.
     *
     * On the whole, the best approach seems to be to assume the first
     * 36 bytes are valid no matter what the device says.  That's
     * better than copying < 36 bytes to the inquiry-result buffer
     * and displaying garbage for the Vendor, Product, or Revision
     * strings.
     */
    if (sdev->inquiry_len < 36) {
        /* Log the short-INQUIRY notice only once per host. */
        if (!sdev->host->short_inquiry) {
            shost_printk(KERN_INFO, sdev->host,
                    "scsi scan: INQUIRY result too short (%d),"
                    " using 36\n", sdev->inquiry_len);
            sdev->host->short_inquiry = 1;
        }
        sdev->inquiry_len = 36;
    }

    /*
     * Related to the above issue:
     *
     * XXX Devices (disk or all?) should be sent a TEST UNIT READY,
     * and if not ready, sent a START_STOP to start (maybe spin up) and
     * then send the INQUIRY again, since the INQUIRY can change after
     * a device is initialized.
     *
     * Ideally, start a device if explicitly asked to do so.  This
     * assumes that a device is spun up on power on, spun down on
     * request, and then spun up on request.
     */

    /*
     * The scanning code needs to know the scsi_level, even if no
     * device is attached at LUN 0 (SCSI_SCAN_TARGET_PRESENT) so
     * non-zero LUNs can be scanned.
     */
    sdev->scsi_level = inq_result[2] & 0x07;
    if (sdev->scsi_level >= 2 ||
        (sdev->scsi_level == 1 && (inq_result[3] & 0x0f) == 1))
        sdev->scsi_level++;
    sdev->sdev_target->scsi_level = sdev->scsi_level;

    /*
     * If SCSI-2 or lower, and if the transport requires it,
     * store the LUN value in CDB[1].
     */
    sdev->lun_in_cdb = 0;
    if (sdev->scsi_level <= SCSI_2 &&
        sdev->scsi_level != SCSI_UNKNOWN &&
        !sdev->host->no_scsi2_lun_in_cdb)
        sdev->lun_in_cdb = 1;

    return 0;
}


static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
        int *bflags, int async)
{
    int ret;

    /*
     * XXX do not save the inquiry, since it can change underneath us,
     * save just vendor/model/rev.
     *
     * Rather than save it and have an ioctl that retrieves the saved
     * value, have an ioctl that executes the same INQUIRY code used
     * in scsi_probe_lun, let user level programs doing INQUIRY
     * scanning run at their own risk, or supply a user level program
     * that can correctly scan.
     */

    /*
     * Copy at least 36 bytes of INQUIRY data, so that we don't
     * dereference unallocated memory when accessing the Vendor,
     * Product, and Revision strings.  Badly behaved devices may set
     * the INQUIRY Additional Length byte to a small value, indicating
     * these strings are invalid, but often they contain plausible data
     * nonetheless.  It doesn't matter if the device sent < 36 bytes
     * total, since scsi_probe_lun() initializes inq_result with 0s.
     */
    sdev->inquiry = kmemdup(inq_result,
                max_t(size_t, sdev->inquiry_len, 36),
                GFP_ATOMIC);
    if (sdev->inquiry == NULL)
        return SCSI_SCAN_NO_RESPONSE;

    sdev->vendor = (char *) (sdev->inquiry + 8); //第8个字节到第15个字节是vendor identification
    sdev->model = (char *) (sdev->inquiry + 16); //第16个字节到第31个字节是product identification
    sdev->rev = (char *) (sdev->inquiry + 32);   //第32个字节到第35个字节是product revision level

    if (strncmp(sdev->vendor, "ATA     ", 8) == 0) {
        /*
         * sata emulation layer device.  This is a hack to work around
         * the SATL power management specifications which state that
         * when the SATL detects the device has gone into standby
         * mode, it shall respond with NOT READY.
         */
        sdev->allow_restart = 1;
    }

    if (*bflags & BLIST_ISROM) {
        sdev->type = TYPE_ROM;
        sdev->removable = 1;
    } else {
        sdev->type = (inq_result[0] & 0x1f);
        sdev->removable = (inq_result[1] & 0x80) >> 7;

        /*
         * some devices may respond with wrong type for
         * well-known logical units. Force well-known type
         * to enumerate them correctly.
         */
        if (scsi_is_wlun(sdev->lun) && sdev->type != TYPE_WLUN) {
            sdev_printk(KERN_WARNING, sdev,
                "%s: correcting incorrect peripheral device type 0x%x for W-LUN 0x%16xhN\n",
                __func__, sdev->type, (unsigned int)sdev->lun);
            sdev->type = TYPE_WLUN;
        }

    }

    if (sdev->type == TYPE_RBC || sdev->type == TYPE_ROM) {
        /* RBC and MMC devices can return SCSI-3 compliance and yet
         * still not support REPORT LUNS, so make them act as
         * BLIST_NOREPORTLUN unless BLIST_REPORTLUN2 is
         * specifically set */
        if ((*bflags & BLIST_REPORTLUN2) == 0)
            *bflags |= BLIST_NOREPORTLUN;
    }

    /*
     * For a peripheral qualifier (PQ) value of 1 (001b), the SCSI
     * spec says: The device server is capable of supporting the
     * specified peripheral device type on this logical unit. However,
     * the physical device is not currently connected to this logical
     * unit.
     *
     * The above is vague, as it implies that we could treat 001 and
     * 011 the same. Stay compatible with previous code, and create a
     * scsi_device for a PQ of 1
     *
     * Don't set the device offline here; rather let the upper
     * level drivers eval the PQ to decide whether they should
     * attach. So remove ((inq_result[0] >> 5) & 7) == 1 check.
     */

    sdev->inq_periph_qual = (inq_result[0] >> 5) & 7;
    sdev->lockable = sdev->removable;
    sdev->soft_reset = (inq_result[7] & 1) && ((inq_result[3] & 7) == 2);

    if (sdev->scsi_level >= SCSI_3 ||
            (sdev->inquiry_len > 56 && inq_result[56] & 0x04))
        sdev->ppr = 1;
    if (inq_result[7] & 0x60)
        sdev->wdtr = 1;
    if (inq_result[7] & 0x10)
        sdev->sdtr = 1;

    sdev_printk(KERN_NOTICE, sdev, "%s %.8s %.16s %.4s PQ: %d "
            "ANSI: %d%s\n", scsi_device_type(sdev->type),
            sdev->vendor, sdev->model, sdev->rev,
            sdev->inq_periph_qual, inq_result[2] & 0x07,
            (inq_result[3] & 0x0f) == 1 ? " CCS" : "");

    if ((sdev->scsi_level >= SCSI_2) && (inq_result[7] & 2) &&
        !(*bflags & BLIST_NOTQ)) {
        sdev->tagged_supported = 1;
        sdev->simple_tags = 1;
    }

    /*
     * Some devices (Texel CD ROM drives) have handshaking problems
     * when used with the Seagate controllers. borken is initialized
     * to 1, and then set it to 0 here.
     */
    if ((*bflags & BLIST_BORKEN) == 0)
        sdev->borken = 0;

    if (*bflags & BLIST_NO_ULD_ATTACH)
        sdev->no_uld_attach = 1;

    /*
     * Apparently some really broken devices (contrary to the SCSI
     * standards) need to be selected without asserting ATN
     */
    if (*bflags & BLIST_SELECT_NO_ATN)
        sdev->select_no_atn = 1;

    /*
     * Maximum 512 sector transfer length
     * broken RA4x00 Compaq Disk Array
     */
    if (*bflags & BLIST_MAX_512)
        blk_queue_max_hw_sectors(sdev->request_queue, 512);
    /*
     * Max 1024 sector transfer length for targets that report incorrect
     * max/optimal lengths and relied on the old block layer safe default
     */
    else if (*bflags & BLIST_MAX_1024)
        blk_queue_max_hw_sectors(sdev->request_queue, 1024);

    /*
     * Some devices may not want to have a start command automatically
     * issued when a device is added.
     */
    if (*bflags & BLIST_NOSTARTONADD)
        sdev->no_start_on_add = 1;

    if (*bflags & BLIST_SINGLELUN)
        scsi_target(sdev)->single_lun = 1;

    sdev->use_10_for_rw = 1;

    if (*bflags & BLIST_MS_SKIP_PAGE_08)
        sdev->skip_ms_page_8 = 1;

    if (*bflags & BLIST_MS_SKIP_PAGE_3F)
        sdev->skip_ms_page_3f = 1;

    if (*bflags & BLIST_USE_10_BYTE_MS)
        sdev->use_10_for_ms = 1;

    /* some devices don't like REPORT SUPPORTED OPERATION CODES
     * and will simply timeout causing sd_mod init to take a very
     * very long time */
    if (*bflags & BLIST_NO_RSOC)
        sdev->no_report_opcodes = 1;

    /* set the device running here so that slave configure
     * may do I/O */
    ret = scsi_device_set_state(sdev, SDEV_RUNNING); //状态
    if (ret) {
        ret = scsi_device_set_state(sdev, SDEV_BLOCK);

        if (ret) {
            sdev_printk(KERN_ERR, sdev,
                    "in wrong state %s to complete scan\n",
                    scsi_device_state_name(sdev->sdev_state));
            return SCSI_SCAN_NO_RESPONSE;
        }
    }

    if (*bflags & BLIST_MS_192_BYTES_FOR_3F)
        sdev->use_192_bytes_for_3f = 1;

    if (*bflags & BLIST_NOT_LOCKABLE)
        sdev->lockable = 0;

    if (*bflags & BLIST_RETRY_HWERROR)
        sdev->retry_hwerror = 1;

    if (*bflags & BLIST_NO_DIF)
        sdev->no_dif = 1;

    sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT;

    if (*bflags & BLIST_TRY_VPD_PAGES)
        sdev->try_vpd_pages = 1;
    else if (*bflags & BLIST_SKIP_VPD_PAGES)
        sdev->skip_vpd_pages = 1;

    transport_configure_device(&sdev->sdev_gendev); //把lun配置到scsi传输层

    if (sdev->host->hostt->slave_configure) {
        ret = sdev->host->hostt->slave_configure(sdev); //主机适配器模板设置的回调，对scsi_device(lun)执行特定的初始化
        if (ret) {
            /*
             * if LLDD reports slave not present, don't clutter
             * console with alloc failure messages
             */
            if (ret != -ENXIO) {
                sdev_printk(KERN_ERR, sdev,
                    "failed to configure device\n");
            }
            return SCSI_SCAN_NO_RESPONSE;
        }
    }

    if (sdev->scsi_level >= SCSI_3)
        scsi_attach_vpd(sdev);

    sdev->max_queue_depth = sdev->queue_depth;  //设置最大队列深度

    /*
     * Ok, the device is now all set up, we can
     * register it and tell the rest of the kernel
     * about it.
     */ //添加scsi_device(lun)到sysfs
    if (!async && scsi_sysfs_add_sdev(sdev) != 0)
        return SCSI_SCAN_NO_RESPONSE;

    return SCSI_SCAN_LUN_PRESENT;
}