电竞比分网-中国电竞赛事及体育赛事平台

分享

Linux存儲IO棧

 waston 2019-03-08
本系列文章將自底向上分析Linux存儲IO棧源碼(基于4.4.19),為學習Linux存儲做記錄。具體目錄如下:
一、 Linux內核對象與對象集
二、 sysfs
三、 設備模型
四、 SCSI子系統

五、 SCSI磁盤驅動sd

六、 SCSI Target--TCM

七、 用戶空間IO--UIO

八、 在用戶空間實現虛擬SCSI磁盤--TCMU

九、 通用塊層

十、文件系統--VFS


Linux內核對象和對象集

內核對象作為Linux設備驅動模型的基礎,主要是抽象和封裝總線、設備、驅動、類和接口之間的關系及其具體實現的相關代碼,并在sysfs中呈現。主要抽象成kobject和kset結構:

/* Kernel object: the basic node of the device-model hierarchy shown in sysfs. */
struct kobject {
    const char      *name;   /* name displayed in sysfs */
    struct list_head    entry;   /* links this kobject into its kset's list */
    struct kobject      *parent; /* parent kobject; expresses the tree structure */
    struct kset     *kset;   /* kset this kobject is linked into */
    struct kobj_type    *ktype;  /* common methods and attributes of this kobject */
    struct kernfs_node  *sd;     /* sysfs directory entry */
    struct kref     kref;    /* reference count */
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
    struct delayed_work release; /* work item used for the delayed cleanup */
#endif
    unsigned int state_initialized:1;  /* set once the kobject has been initialized */
    unsigned int state_in_sysfs:1;     /* set while the kobject is present in sysfs */
    unsigned int state_add_uevent_sent:1; /* an ADD uevent was sent to userspace */
    unsigned int state_remove_uevent_sent:1; /* a REMOVE uevent was sent to userspace */
    unsigned int uevent_suppress:1; /* when set, uevents for this kobject are dropped */
};

在kobject結構中,ktype域對kobject的一些通用方法和屬性進行封裝:

/* Methods and default attributes shared by kobjects of one type. */
struct kobj_type {
    void (*release)(struct kobject *kobj); /* called back when the kobject is being released */
    const struct sysfs_ops *sysfs_ops; /* sysfs operation callbacks */
    struct attribute **default_attrs;  /* NULL-terminated array of default attributes */
        /* namespace related operations */
    const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
    const void *(*namespace)(struct kobject *kobj);
};

kset是一組kobject的集合,通過kset可以遍歷這組kobject。例如在SCSI子系統中,設備是一種kobject,通過設備集kset可以遍歷所有的設備。

/**
 * struct kset - a set of kobjects of a specific type, belonging to a specific subsystem.
 *
 * A kset defines a group of kobjects.  They can be individually
 * different "types" but overall these kobjects all want to be grouped
 * together and operated on in the same manner.  ksets are used to
 * define the attribute callbacks and other common events that happen to
 * a kobject.
 *
 * @list: the list of all kobjects for this kset
 * @list_lock: a lock for iterating over the kobjects
 * @kobj: the embedded kobject for this kset (recursion, isn't it fun...)
 * @uevent_ops: the set of uevent operations for this kset.  These are
 * called whenever a kobject has something happen to it so that the kset
 * can add new environment variables, or filter out the uevents if so
 * desired.
 */
struct kset {
    struct list_head list; //鏈入kset的kobject鏈表
    spinlock_t list_lock;  //遍歷鏈表是的自旋鎖struct kobject kobj;   //本身可以當做kobject對待
    const struct kset_uevent_ops *uevent_ops; //發(fā)送uevent事件的回調(diào)函數(shù)
};

在發送事件到用戶空間時,可以回調kset_uevent_ops中的3個回調函數:

/* Per-kset callbacks consulted while a uevent is being prepared. */
struct kset_uevent_ops {
    /* Filter events before they are sent; a zero return drops the event. */
    int (* const filter)(struct kset *kset, struct kobject *kobj);
    /* Supplies the name used as the originating subsystem of the event. */
    const char *(* const name)(struct kset *kset, struct kobject *kobj);
    /* Adds the environment variables the uevent needs; non-zero aborts the event. */
    int (* const uevent)(struct kset *kset, struct kobject *kobj,
              struct kobj_uevent_env *env);
};
  • filter:在發送事件之前過濾某些事件。

  • name: 獲取名稱(作為事件的SUBSYSTEM)。

  • uevent:設置uevent需要的環境變量。

內核對象關系

內核對象相關操作

/* kobject API overview (declarations only). */

/* Initialization and registration in the hierarchy / sysfs. */
void kobject_init(struct kobject *kobj, struct kobj_type *ktype);
int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...);
int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...);
/* Unlink a kobject from the hierarchy. */
void kobject_del(struct kobject *kobj);
/* Allocating variants of init / init-and-add. */
struct kobject *  kobject_create(void);
struct kobject * kobject_create_and_add(const char *name, struct kobject *parent);
/* Rename / move -- definitions are not shown in this article. */
int kobject_rename(struct kobject *, const char *new_name);
int kobject_move(struct kobject *, struct kobject *);
/* Reference counting. */
struct kobject *kobject_get(struct kobject *kobj);
void kobject_put(struct kobject *kobj);
/* Namespace tag and path helpers -- definitions are not shown in this article. */
const void *kobject_namespace(struct kobject *kobj);
char *kobject_get_path(struct kobject *kobj, gfp_t flag);

內核對象創建及初始化

初始化流程主要在kobject_init:

/**
 * kobject_init - initialize a kobject structure
 * @kobj: pointer to the kobject to initialize
 * @ktype: pointer to the ktype for this kobject.
 *
 * This function will properly initialize a kobject such that it can then
 * be passed to the kobject_add() call.
 *
 * After this function is called, the kobject MUST be cleaned up by a call
 * to kobject_put(), not by a call to kfree directly to ensure that all of
 * the memory is cleaned up properly.
 */
void kobject_init(struct kobject *kobj, struct kobj_type *ktype)
{
    /* Only ever points at string literals, hence const. */
    const char *err_str;

    if (!kobj) {
        err_str = "invalid kobject pointer!";
        goto error;
    }
    if (!ktype) {
        /* No trailing newline here: the printk() at "error" appends one. */
        err_str = "must have a ktype to be initialized properly!";
        goto error;
    }
    if (kobj->state_initialized) {  /* guard against double initialization */
        /* do not error out as sometimes we can recover */
        printk(KERN_ERR "kobject (%p): tried to init an initialized "
               "object, something is seriously wrong.\n", kobj);
        dump_stack();
    }

    kobject_init_internal(kobj); /* does the actual field initialization */
    kobj->ktype = ktype;
    return;

error:
    /* Invalid arguments: report and leave the kobject untouched. */
    printk(KERN_ERR "kobject (%p): %s\n", kobj, err_str);
    dump_stack();
}
EXPORT_SYMBOL(kobject_init);

由上面函數可以看出,實際的初始化由kobject_init_internal完成:

/*
 * Put a kobject into its initial state: reference count and list entry
 * initialized, state flags reset.  A NULL pointer is silently ignored.
 */
static void kobject_init_internal(struct kobject *kobj)
{
    if (kobj) {
        kref_init(&kobj->kref);
        INIT_LIST_HEAD(&kobj->entry);

        /* Initialized, but not yet in sysfs and no uevent sent so far. */
        kobj->state_initialized = 1;
        kobj->state_in_sysfs = 0;
        kobj->state_add_uevent_sent = 0;
        kobj->state_remove_uevent_sent = 0;
    }
}

kobject_create函數僅僅是在調用kobject_init之前,先分配kobject空間。在kobject初始化之后,需要調用kobject_add將kobject添加到sysfs中。

/**
 * kobject_add - the main kobject add function
 * @kobj: the kobject to add
 * @parent: pointer to the parent of the kobject.
 * @fmt: format to name the kobject with.
 *
 * The kobject name is set and added to the kobject hierarchy in this
 * function.
 *
 * If @parent is set, then the parent of the @kobj will be set to it.
 * If @parent is NULL, then the parent of the @kobj will be set to the
 * kobject associated with the kset assigned to this kobject.  If no kset
 * is assigned to the kobject, then the kobject will be located in the
 * root of the sysfs tree.
 *
 * If this function returns an error, kobject_put() must be called to
 * properly clean up the memory associated with the object.
 * Under no instance should the kobject that is passed to this function
 * be directly freed with a call to kfree(), that can leak memory.
 *
 * Note, no "add" uevent will be created with this call, the caller should set
 * up all of the necessary sysfs files for the object and then call
 * kobject_uevent() with the UEVENT_ADD parameter to ensure that
 * userspace is properly notified of this kobject's creation.
 */
int kobject_add(struct kobject *kobj, struct kobject *parent,
        const char *fmt, ...)
{
    va_list args;
    int retval;

    if (!kobj)
        return -EINVAL;

    if (!kobj->state_initialized) { //add之前需要初始化
        printk(KERN_ERR "kobject '%s' (%p): tried to add an "
               "uninitialized object, something is seriously wrong.\n",
               kobject_name(kobj), kobj);
        dump_stack();
        return -EINVAL;
    }
    va_start(args, fmt);
    retval = kobject_add_varg(kobj, parent, fmt, args); //主要完成add操作
    va_end(args);

    return retval;
}
EXPORT_SYMBOL(kobject_add);

kobject_add_varg/kobject_add_internal主要完成將kobject添加到sysfs的操作:

/*
 * Set the kobject's name from a format string plus va_list, record the
 * requested parent and hand over to kobject_add_internal().
 * Returns 0 on success or a non-zero error code.
 */
static __printf(3, 0) int kobject_add_varg(struct kobject *kobj,
                       struct kobject *parent,
                       const char *fmt, va_list vargs)
{
    /* The name is what the kobject will be shown as in sysfs. */
    int err = kobject_set_name_vargs(kobj, fmt, vargs);

    if (err != 0) {
        printk(KERN_ERR "kobject: can not set name properly!\n");
        return err;
    }

    kobj->parent = parent;
    return kobject_add_internal(kobj);
}

/*
 * Link an already named kobject into the hierarchy and create its sysfs
 * directory.  Returns 0 on success, -ENOENT for a NULL kobject, -EINVAL
 * for an empty name, or the error reported by create_dir().
 */
static int kobject_add_internal(struct kobject *kobj)
{
    int error = 0;
    struct kobject *parent;

    if (!kobj)
        return -ENOENT;

    if (!kobj->name || !kobj->name[0]) {
        WARN(1, "kobject: (%p): attempted to be registered with empty "
             "name!\n", kobj);
        return -EINVAL;
    }

    parent = kobject_get(kobj->parent); /* take a reference on the parent */

    /* join kset if set, use it as parent if we do not already have one */
    if (kobj->kset) { /* kset set but no parent: the kset's own kobject becomes the parent */
        if (!parent)
            parent = kobject_get(&kobj->kset->kobj);
        kobj_kset_join(kobj);
        kobj->parent = parent;
    }

    pr_debug("kobject: '%s' (%p): %s: parent: '%s', set: '%s'\n",
         kobject_name(kobj), kobj, __func__,
         parent ? kobject_name(parent) : "<NULL>",
         kobj->kset ? kobject_name(&kobj->kset->kobj) : "<NULL>");

    error = create_dir(kobj);  /* create the sysfs directory and attribute files */
    if (error) {  /* roll back: leave the kset and drop the parent reference */
        kobj_kset_leave(kobj);
        kobject_put(parent);
        kobj->parent = NULL;

        /* be noisy on error issues */
        if (error == -EEXIST)
            WARN(1, "%s failed for %s with "
                 "-EEXIST, don't try to register things with "
                 "the same name in the same directory.\n",
                 __func__, kobject_name(kobj));
        else
            WARN(1, "%s failed for %s (error: %d parent: %s)\n",
                 __func__, kobject_name(kobj), error,
                 parent ? kobject_name(parent) : "'none'");
    } else
        kobj->state_in_sysfs = 1; /* record that the kobject is now in sysfs */

    return error;
}

由create_dir在sysfs中創建真實的目錄和文件,這一點由下一篇sysfs詳細描述。理解了kobject_init和kobject_add之后,由名字即可知道kobject_init_and_add和kobject_create_and_add這兩個函數的作用。

內核對象釋放

調用kobject_del將kobject從層次結構中移除,再由kobject_put減少引用計數并最終釋放:

/**
 * kobject_del - unlink kobject from hierarchy.
 * @kobj: object.
 */
void kobject_del(struct kobject *kobj)
{
    struct kernfs_node *dirent;

    if (kobj == NULL)
        return;

    /*
     * Remember the sysfs node before removing the directory, then drop
     * the reference on it (NOTE(review): presumably the extra reference
     * taken in create_dir() -- confirm).
     */
    dirent = kobj->sd;
    sysfs_remove_dir(kobj);
    sysfs_put(dirent);

    /* No longer in sysfs: leave the kset and release the parent reference. */
    kobj->state_in_sysfs = 0;
    kobj_kset_leave(kobj);
    kobject_put(kobj->parent);
    kobj->parent = NULL;
}
EXPORT_SYMBOL(kobject_del);

/**
 * kobject_put - decrement refcount for object.
 * @kobj: object.
 *
 * Decrement the refcount, and if 0, call kobject_cleanup().
 */
void kobject_put(struct kobject *kobj)
{
    if (!kobj)
        return;

    /* Putting a never-initialized kobject is a caller bug: warn loudly. */
    if (!kobj->state_initialized)
        WARN(1, KERN_WARNING "kobject: '%s' (%p): is not "
               "initialized, yet kobject_put() is being "
               "called.\n", kobject_name(kobj), kobj);

    /* kobject_release() runs when the reference count reaches zero. */
    kref_put(&kobj->kref, kobject_release);
}
EXPORT_SYMBOL(kobject_put);

/*
 * Drop a single reference; returns what kref_sub() returns (1 when the
 * release callback was invoked, 0 otherwise).
 */
static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
{
    const unsigned int one_ref = 1;

    return kref_sub(kref, one_ref, release);
}

/*
 * Subtract @count references.  If the counter reaches zero, @release is
 * called with the kref and 1 is returned; otherwise 0 is returned.
 */
static inline int kref_sub(struct kref *kref, unsigned int count,
         void (*release)(struct kref *kref))
{
    WARN_ON(release == NULL);

    if (!atomic_sub_and_test((int) count, &kref->refcount))
        return 0;

    /* Last reference gone: for kobjects this ends up in kobject_release(). */
    release(kref);
    return 1;
}

根據上面的代碼追蹤,得知kobject_release才是釋放kobject的主角:

/*
 * kref release callback for kobjects, passed to kref_put() by
 * kobject_put().  Cleans the kobject up either immediately or, with
 * CONFIG_DEBUG_KOBJECT_RELEASE, from delayed work.
 */
static void kobject_release(struct kref *kref)
{
    struct kobject *kobj = container_of(kref, struct kobject, kref);
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
    /* Randomized delay between HZ and 4 * HZ jiffies. */
    unsigned long delay = HZ + HZ * (get_random_int() & 0x3);
    /* %lu, not %ld: delay is an unsigned long. */
    pr_info("kobject: '%s' (%p): %s, parent %p (delayed %lu)\n",
         kobject_name(kobj), kobj, __func__, kobj->parent, delay);
    INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup);
    /* Defer the cleanup: kobject_delayed_cleanup() runs after the delay. */
    schedule_delayed_work(&kobj->release, delay);
#else
    kobject_cleanup(kobj);  /* clean up right away */
#endif
}

如果在內核編譯時指定CONFIG_DEBUG_KOBJECT_RELEASE,則使用延遲release方式調用kobject_delayed_cleanup,否則直接調用kobject_cleanup。

#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
/*
 * Delayed-work handler: map the work item back to the kobject embedding
 * it and run the regular cleanup.
 */
static void kobject_delayed_cleanup(struct work_struct *work)
{
    struct delayed_work *dwork = to_delayed_work(work);
    struct kobject *kobj = container_of(dwork, struct kobject, release);

    kobject_cleanup(kobj);
}
#endif

/*
 * kobject_cleanup - free kobject resources.
 * @kobj: object to cleanup
 */
static void kobject_cleanup(struct kobject *kobj)
{
    struct kobj_type *t = get_ktype(kobj);
    /*
     * Keep the name pointer in a local: it is still needed after
     * t->release(kobj) has run (NOTE(review): presumably because
     * release() may free the kobject itself -- confirm).
     */
    const char *name = kobj->name;

    pr_debug("kobject: '%s' (%p): %s, parent %p\n",
         kobject_name(kobj), kobj, __func__, kobj->parent);

    if (t && !t->release)
        pr_debug("kobject: '%s' (%p): does not have a release() "
             "function, it is broken and must be fixed.\n",
             kobject_name(kobj), kobj);

    /* send "remove" if the caller did not do it but sent "add" */
    if (kobj->state_add_uevent_sent && !kobj->state_remove_uevent_sent) {
        pr_debug("kobject: '%s' (%p): auto cleanup 'remove' event\n",
             kobject_name(kobj), kobj);
        kobject_uevent(kobj, KOBJ_REMOVE); /* the REMOVE event is sent only once */
    }

    /* remove from sysfs if the caller did not do it */
    if (kobj->state_in_sysfs) {
        pr_debug("kobject: '%s' (%p): auto cleanup kobject_del\n",
             kobject_name(kobj), kobj);
        kobject_del(kobj); /* caller did not remove it from sysfs, so do it here */
    }

    if (t && t->release) {
        pr_debug("kobject: '%s' (%p): calling ktype release\n",
             kobject_name(kobj), kobj);
        t->release(kobj); /* invoke the kobj_type release callback */
    }

    /* free name if we allocated it */
    if (name) {
        pr_debug("kobject: '%s': free name\n", name);
        kfree_const(name);
    }
}

內核對象集相關操作

/* kset API overview (declarations only). */

/* Initialize the embedded kobject, list and lock of a kset. */
void kset_init(struct kset *kset);
/* Allocate a kset and fill in name, uevent_ops and parent. */
struct kset *kset_create(const char *name, const struct kset_uevent_ops *uevent_ops, struct kobject *parent_kobj);
/* Initialize a kset and add it to sysfs / remove it again. */
int kset_register(struct kset *kset);
void kset_unregister(struct kset *kset);
/* Create-and-register convenience wrapper -- definition is not shown in this article. */
struct kset * kset_create_and_add(const char *name, const struct kset_uevent_ops *u, struct kobject *parent_kobj);

內核對象集創建及初始化

內核對象集由kset_create創建:

/**
 * kset_create - create a struct kset dynamically
 *
 * @name: the name for the kset
 * @uevent_ops: a struct kset_uevent_ops for the kset
 * @parent_kobj: the parent kobject of this kset, if any.
 *
 * This function creates a kset structure dynamically.  This structure can
 * then be registered with the system and show up in sysfs with a call to
 * kset_register().  When you are finished with this structure, if
 * kset_register() has been called, call kset_unregister() and the
 * structure will be dynamically freed when it is no longer being used.
 *
 * If the kset was not able to be created, NULL will be returned.
 */
static struct kset *kset_create(const char *name,
                const struct kset_uevent_ops *uevent_ops,
                struct kobject *parent_kobj)
{
    /* Zeroed allocation for the new kset. */
    struct kset *new_set = kzalloc(sizeof(*new_set), GFP_KERNEL);

    if (new_set == NULL)
        return NULL;

    /* The name is what the kset will be shown as in sysfs. */
    if (kobject_set_name(&new_set->kobj, "%s", name) != 0) {
        kfree(new_set);
        return NULL;
    }

    new_set->uevent_ops = uevent_ops;
    new_set->kobj.parent = parent_kobj;

    /*
     * The embedded kobject gets kset_ktype as its type and is not a
     * member of any kset itself, so it can be freed properly once it is
     * no longer in use.
     */
    new_set->kobj.ktype = &kset_ktype;
    new_set->kobj.kset = NULL;

    return new_set;
}

內核對象集由kset_init執行初始化:

/**
 * kset_init - initialize a kset for use
 * @k: kset
 */
void kset_init(struct kset *k)
{
    struct kobject *embedded = &k->kobj;

    /* The embedded kobject is initialized like any other kobject. */
    kobject_init_internal(embedded);

    /* Empty member list plus the lock that protects it. */
    INIT_LIST_HEAD(&k->list);
    spin_lock_init(&k->list_lock);
}

/*
 * Initialize reference count, list entry and state flags of a kobject.
 * Does nothing when @kobj is NULL.
 */
static void kobject_init_internal(struct kobject *kobj)
{
    if (kobj) {
        kref_init(&kobj->kref);
        INIT_LIST_HEAD(&kobj->entry);

        /* State flags: initialized, not in sysfs, no uevents sent yet. */
        kobj->state_initialized = 1;
        kobj->state_in_sysfs = 0;
        kobj->state_add_uevent_sent = 0;
        kobj->state_remove_uevent_sent = 0;
    }
}

kset_register先調用kset_init完成初始化,再將kset添加到sysfs:

/**
 * kset_register - initialize and add a kset.
 * @k: kset.
 */
int kset_register(struct kset *k)
{
    int err;

    if (!k)
        return -EINVAL;

    kset_init(k);
    err = kobject_add_internal(&k->kobj); //完成register動作,前面已說明
    if (err)
        return err;
    kobject_uevent(&k->kobj, KOBJ_ADD); //發(fā)送ADD事件到用戶空間
    return 0;
}
EXPORT_SYMBOL(kset_register);

經過kset_create, kset_init和kset_register之后,kset已初始化并添加完成。當然kset_create_and_add包含了這三個函數。

內核對象集釋放

內核對象集的釋放過程與kobject的釋放過程類似,由kset_unregister完成:

/**
 * kset_unregister - remove a kset.
 * @k: kset.
 */
void kset_unregister(struct kset *k)
{
    if (!k)
        return;
    kobject_del(&k->kobj);  //刪除sysfs的目錄和屬性文件,前面已說明
    kobject_put(&k->kobj);  //與kobject釋放過程一致
}
EXPORT_SYMBOL(kset_unregister);

發送事件到用戶空間

由前面的代碼可以看到無論kobject或是kset,都會向用戶空間發送事件,由kobject_uevent函數通過設置環境變量的方式完成:

/* Environment that accompanies a uevent on its way to userspace. */
struct kobj_uevent_env {
    char *argv[3];                /* command line used by the usermode helper */
    char *envp[UEVENT_NUM_ENVP];  /* array of environment variable strings */
    int envp_idx;                 /* number of envp entries filled in so far */
    char buf[UEVENT_BUFFER_SIZE]; /* buffer holding the environment variable data */
    int buflen;                   /* used when sizing the netlink message; presumably bytes of buf in use -- TODO confirm */
};

/**
 * kobject_uevent - notify userspace by sending an uevent
 *
 * @action: action that is happening
 * @kobj: struct kobject that the action is happening to
 *
 * Returns 0 if kobject_uevent() is completed with success or the
 * corresponding error when it fails.
 */
int kobject_uevent(struct kobject *kobj, enum kobject_action action)
{
    /* No caller-supplied environment: only the default keys are sent. */
    char **no_extra_env = NULL;

    return kobject_uevent_env(kobj, action, no_extra_env);
}
EXPORT_SYMBOL_GPL(kobject_uevent);

/**
 * kobject_uevent_env - send an uevent with environmental data
 *
 * @action: action that is happening
 * @kobj: struct kobject that the action is happening to
 * @envp_ext: pointer to environmental data
 *
 * Returns 0 if kobject_uevent_env() is completed with success or the
 * corresponding error when it fails.
 */
int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
               char *envp_ext[])
{
    struct kobj_uevent_env *env;
    const char *action_string = kobject_actions[action];
    const char *devpath = NULL;
    const char *subsystem;
    struct kobject *top_kobj;
    struct kset *kset;
    const struct kset_uevent_ops *uevent_ops;
    int i = 0;
    int retval = 0;
#ifdef CONFIG_NET
    struct uevent_sock *ue_sk;
#endif

    pr_debug("kobject: '%s' (%p): %s\n",
         kobject_name(kobj), kobj, __func__);

    /* search the kset we belong to */
    top_kobj = kobj;
    while (!top_kobj->kset && top_kobj->parent)  /* walk up to the nearest ancestor with a kset; the kset carries the uevent_ops */
        top_kobj = top_kobj->parent;

    if (!top_kobj->kset) {
        pr_debug("kobject: '%s' (%p): %s: attempted to send uevent "
             "without kset!\n", kobject_name(kobj), kobj,
             __func__);
        return -EINVAL;
    }

    kset = top_kobj->kset;
    uevent_ops = kset->uevent_ops;  /* the kset's uevent_ops drive the rest of the send */

    /* skip the event, if uevent_suppress is set*/
    if (kobj->uevent_suppress) {  /* kobjects flagged uevent_suppress send nothing */
        pr_debug("kobject: '%s' (%p): %s: uevent_suppress "
                 "caused the event to drop!\n",
                 kobject_name(kobj), kobj, __func__);
        return 0;
    }
    /* skip the event, if the filter returns zero. */
    if (uevent_ops && uevent_ops->filter)  /* give the kset's filter callback a chance to drop the event */
        if (!uevent_ops->filter(kset, kobj)) {
            pr_debug("kobject: '%s' (%p): %s: filter function "
                 "caused the event to drop!\n",
                 kobject_name(kobj), kobj, __func__);
            return 0;
        }

    /* originating subsystem */
    if (uevent_ops && uevent_ops->name)  /* subsystem name: from the name callback, else the kset's own name */
        subsystem = uevent_ops->name(kset, kobj);
    else
        subsystem = kobject_name(&kset->kobj);
    if (!subsystem) {
        pr_debug("kobject: '%s' (%p): %s: unset subsystem caused the "
             "event to drop!\n", kobject_name(kobj), kobj,
             __func__);
        return 0;
    }

    /* environment buffer */
    env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); /* allocate the kobj_uevent_env */
    if (!env)
        return -ENOMEM;

    /* complete object path */
    devpath = kobject_get_path(kobj, GFP_KERNEL);
    if (!devpath) {
        retval = -ENOENT;
        goto exit;
    }

    /* default keys: add the standard environment variables */
    retval = add_uevent_var(env, "ACTION=%s", action_string);
    if (retval)
        goto exit;
    retval = add_uevent_var(env, "DEVPATH=%s", devpath);
    if (retval)
        goto exit;
    retval = add_uevent_var(env, "SUBSYSTEM=%s", subsystem);
    if (retval)
        goto exit;

    /* keys passed in from the caller */
    if (envp_ext) {
        for (i = 0; envp_ext[i]; i++) {
            retval = add_uevent_var(env, "%s", envp_ext[i]);
            if (retval)
                goto exit;
        }
    }

    /* let the kset specific function add its stuff */
    if (uevent_ops && uevent_ops->uevent) { /* uevent callback adds subsystem-specific environment variables */
        retval = uevent_ops->uevent(kset, kobj, env);
        if (retval) {
            pr_debug("kobject: '%s' (%p): %s: uevent() returned "
                 "%d\n", kobject_name(kobj), kobj,
                 __func__, retval);
            goto exit;
        }
    }

    /*
     * Mark "add" and "remove" events in the object to ensure proper
     * events to userspace during automatic cleanup. If the object did
     * send an "add" event, "remove" will automatically generated by
     * the core, if not already done by the caller.
     */
    if (action == KOBJ_ADD)
        kobj->state_add_uevent_sent = 1;
    else if (action == KOBJ_REMOVE)
        kobj->state_remove_uevent_sent = 1;

    mutex_lock(&uevent_sock_mutex);
    /* we will send an event, so request a new sequence number */
    retval = add_uevent_var(env, "SEQNUM=%llu", (unsigned long long)++uevent_seqnum);
    if (retval) {
        mutex_unlock(&uevent_sock_mutex);
        goto exit;
    }

#if defined(CONFIG_NET)  /* with CONFIG_NET the event is sent over netlink */
    /* send netlink message */
    list_for_each_entry(ue_sk, &uevent_sock_list, list) {
        struct sock *uevent_sock = ue_sk->sk;
        struct sk_buff *skb;
        size_t len;

        if (!netlink_has_listeners(uevent_sock, 1))
            continue;

        /* allocate message with the maximum possible size */
        len = strlen(action_string) + strlen(devpath) + 2;
        skb = alloc_skb(len + env->buflen, GFP_KERNEL);
        if (skb) {
            char *scratch;

            /* add header */
            scratch = skb_put(skb, len);
            sprintf(scratch, "%s@%s", action_string, devpath);

            /* copy keys to our continuous event payload buffer */
            for (i = 0; i < env->envp_idx; i++) {
                len = strlen(env->envp[i]) + 1;
                scratch = skb_put(skb, len);
                strcpy(scratch, env->envp[i]);
            }

            NETLINK_CB(skb).dst_group = 1;
            retval = netlink_broadcast_filtered(uevent_sock, skb, /* broadcast to netlink group 1 */
                                0, 1, GFP_KERNEL,
                                kobj_bcast_filter,
                                kobj);
            /* ENOBUFS should be handled in userspace */
            if (retval == -ENOBUFS || retval == -ESRCH)
                retval = 0;
        } else
            retval = -ENOMEM;
    }
#endif
    mutex_unlock(&uevent_sock_mutex);

#ifdef CONFIG_UEVENT_HELPER  /* with CONFIG_UEVENT_HELPER, also run the usermode helper (compiled independently of the netlink path above) */
    /* call uevent_helper, usually only enabled during early boot */
    if (uevent_helper[0] && !kobj_usermode_filter(kobj)) {
        struct subprocess_info *info;

        retval = add_uevent_var(env, "HOME=/");
        if (retval)
            goto exit;
        retval = add_uevent_var(env,
                    "PATH=/sbin:/bin:/usr/sbin:/usr/bin");
        if (retval)
            goto exit;
        retval = init_uevent_argv(env, subsystem); /* build the userspace command and its arguments */
        if (retval)
            goto exit;

        retval = -ENOMEM;
        info = call_usermodehelper_setup(env->argv[0], env->argv,  /* set up the call to the userspace helper program (env->argv[0]) */
                         env->envp, GFP_KERNEL,
                         NULL, cleanup_uevent_env, env);
        if (info) {
            retval = call_usermodehelper_exec(info, UMH_NO_WAIT);
            env = NULL; /* freed by cleanup_uevent_env */
        }
    }
#endif

exit:
    kfree(devpath);
    kfree(env);
    return retval;
}
EXPORT_SYMBOL_GPL(kobject_uevent_env);

sysfs與內核對象

本篇文章不是以文件系統的角度來詳細描述sysfs,而是從內核對象如何通過sysfs表示整個設備驅動模型為切入點,進一步理解Linux內核對象。

內核對象添加到sysfs

在上文《內核對象與對象集》中,將kobject添加到sysfs的調用路徑為kobject_add –> kobject_add_varg –> kobject_add_internal,最終調用create_dir創建sysfs目錄和屬性文件。

static int create_dir(struct kobject *kobj)
{
    const struct kobj_ns_type_operations *ops;
    int error;
        //調(diào)用sysfs接口創(chuàng)建kobject對應的目錄
    error = sysfs_create_dir_ns(kobj, kobject_namespace(kobj));
    if (error)
        return error;

    error = populate_dir(kobj);  //在kobject對應的目錄中生成默認屬性文件
    if (error) {
        sysfs_remove_dir(kobj);
        return error;
    }

    /*
     * @kobj->sd may be deleted by an ancestor going away.  Hold an
     * extra reference so that it stays until @kobj is gone.
     */
    sysfs_get(kobj->sd);

    /*
     * If @kobj has ns_ops, its children need to be filtered based on
     * their namespace tags.  Enable namespace support on @kobj->sd.
     */
    ops = kobj_child_ns_ops(kobj);
    if (ops) {
        BUG_ON(ops->type <= KOBJ_NS_TYPE_NONE);
        BUG_ON(ops->type >= KOBJ_NS_TYPES);
        BUG_ON(!kobj_ns_type_registered(ops->type));

        sysfs_enable_ns(kobj->sd);
    }

    return 0;
}

/*
 * populate_dir - populate directory with attributes.
 * @kobj: object we're working on.
 *
 * Most subsystems have a set of default attributes that are associated
 * with an object that registers with them.  This is a helper called during
 * object registration that loops through the default attributes of the
 * subsystem and creates attributes files for them in sysfs.
 */
static int populate_dir(struct kobject *kobj)
{
    struct kobj_type *ktype = get_ktype(kobj);
    struct attribute **cursor;
    int rc;

    /* Nothing to create without a ktype or without default attributes. */
    if (!ktype || !ktype->default_attrs)
        return 0;

    /* One sysfs file per entry of the NULL-terminated attribute array. */
    for (cursor = ktype->default_attrs; *cursor != NULL; cursor++) {
        rc = sysfs_create_file(kobj, *cursor);
        if (rc)
            return rc;
    }

    return 0;
}

create_dir通過調用sysfs_create_dir_ns創建sysfs中的目錄,通過populate_dir調用sysfs_create_file創建屬性文件。

sysfs的核心結構

kernfs_node代表sysfs中每個節點。

/*
 * kernfs_node - the building block of kernfs hierarchy.  Each and every
 * kernfs node is represented by single kernfs_node.  Most fields are
 * private to kernfs and shouldn't be accessed directly by kernfs users.
 *
 * As long as s_count reference is held, the kernfs_node itself is
 * accessible.  Dereferencing elem or any other outer entity requires
 * active reference.
 */
struct kernfs_node {
    atomic_t        count;   /* reference count */
    atomic_t        active;  /* active reference count */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
    struct lockdep_map  dep_map;
#endif
    /*
     * Use kernfs_get_parent() and kernfs_name/path() instead of
     * accessing the following two fields directly.  If the node is
     * never moved to a different parent, it is safe to access the
     * parent directly.
     */
    struct kernfs_node  *parent; /* parent node */
    const char      *name;       /* node name, as displayed in sysfs */

    struct rb_node      rb;      /* link into the red-black tree of siblings */

    const void      *ns;    /* namespace tag */
    unsigned int        hash;   /* ns + name hash; key in the red-black tree */
    union {
        struct kernfs_elem_dir      dir;     /* used when the node is a directory */
        struct kernfs_elem_symlink  symlink; /* used when the node is a symlink */
        struct kernfs_elem_attr     attr;    /* used when the node is an attribute file */
    };

    void            *priv;

    unsigned short      flags; /* flag bits: directory / symlink / attribute file, or whether already removed */
    umode_t         mode;      /* access permissions of this node in sysfs */
    unsigned int        ino;   /* unique number */
    struct kernfs_iattrs    *iattr;  /* non-default inode attributes; NULL when there are none */
};

在sysfs中創建目錄:sysfs_create_dir_ns

/**
 * sysfs_create_dir_ns - create a directory for an object with a namespace tag
 * @kobj: object we're creating directory for
 * @ns: the namespace tag to use
 */
int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
{
    struct kernfs_node *parent_kn, *dir_kn;

    BUG_ON(!kobj);

    /* Use the parent's directory if there is one, else the sysfs root. */
    parent_kn = kobj->parent ? kobj->parent->sd : sysfs_root_kn;
    if (!parent_kn)
        return -ENOENT;

    dir_kn = kernfs_create_dir_ns(parent_kn, kobject_name(kobj),
                      S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns);
    if (!IS_ERR(dir_kn)) {
        /* Success: remember the directory node in the kobject. */
        kobj->sd = dir_kn;
        return 0;
    }

    /* A duplicate name gets a dedicated warning. */
    if (PTR_ERR(dir_kn) == -EEXIST)
        sysfs_warn_dup(parent_kn, kobject_name(kobj));
    return PTR_ERR(dir_kn);
}

/**
 * kernfs_create_dir_ns - create a directory
 * @parent: parent in which to create a new directory
 * @name: name of the new directory
 * @mode: mode of the new directory
 * @priv: opaque data associated with the new directory
 * @ns: optional namespace tag of the directory
 *
 * Returns the created node on success, ERR_PTR() value on failure.
 */
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
                     const char *name, umode_t mode,
                     void *priv, const void *ns)
{
    struct kernfs_node *kn;
    int err;

    /* Allocate and initialize the node; KERNFS_DIR marks it as a directory. */
    kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);
    if (kn == NULL)
        return ERR_PTR(-ENOMEM);

    kn->dir.root = parent->dir.root; /* same root as the parent */
    kn->ns = ns;                     /* namespace tag */
    kn->priv = priv;

    /* Link the node in under its parent. */
    err = kernfs_add_one(kn);
    if (err) {
        kernfs_put(kn);
        return ERR_PTR(err);
    }

    return kn;
}

kernfs_create_dir_ns函數中的兩個主要函數kernfs_new_node和kernfs_add_one,在創建文件和創建符號鏈接時同樣使用,僅是參數不同。

為kernfs_node結構分配空間,并初始化:

/*
 * Allocate and initialize a kernfs_node under @parent's root and point
 * it at @parent, taking a reference on the parent.  Returns NULL when
 * the allocation fails.
 */
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
                    const char *name, umode_t mode,
                    unsigned flags)
{
    struct kernfs_node *kn =
        __kernfs_new_node(kernfs_root(parent), name, mode, flags);

    if (!kn)
        return NULL;

    kernfs_get(parent);
    kn->parent = parent;
    return kn;
}

/*
 * Allocate a kernfs_node from the node cache, give it a copy of @name
 * and a number from @root's ino IDA, and fill in the basic fields.
 * Returns NULL on any allocation failure, with partial work undone.
 */
static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
                         const char *name, umode_t mode,
                         unsigned flags)
{
    struct kernfs_node *kn;
    int ino;

    /* Work on our own copy of the name from here on. */
    name = kstrdup_const(name, GFP_KERNEL);
    if (!name)
        return NULL;

    kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
    if (!kn) {
        kfree_const(name);
        return NULL;
    }

    /*
     * If the ino of the sysfs entry created for a kmem cache gets
     * allocated from an ida layer, which is accounted to the memcg that
     * owns the cache, the memcg will get pinned forever. So do not account
     * ino ida allocations.
     */
    ino = ida_simple_get(&root->ino_ida, 1, 0,
                 GFP_KERNEL | __GFP_NOACCOUNT);
    if (ino < 0) {
        kmem_cache_free(kernfs_node_cache, kn);
        kfree_const(name);
        return NULL;
    }
    kn->ino = ino; /* number identifying this node */

    atomic_set(&kn->count, 1);
    atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
    RB_CLEAR_NODE(&kn->rb);

    kn->name = name;
    kn->mode = mode;
    kn->flags = flags;

    return kn;
}

將kernfs_node添加到parent的紅黑樹中:

/**
 *  kernfs_add_one - add kernfs_node to parent without warning
 *  @kn: kernfs_node to be added
 *
 *  The caller must already have initialized @kn->parent.  This
 *  function increments nlink of the parent's inode if @kn is a
 *  directory and link into the children list of the parent.
 *
 *  The whole check-and-link sequence runs under kernfs_mutex; activation
 *  of the new node happens after the mutex has been dropped.
 *
 *  RETURNS:
 *  0 on success, -EEXIST if entry with the given name already
 *  exists, -EINVAL if the namespace tag does not match the parent's
 *  namespace setting or the parent is not a directory, -ENOENT if the
 *  parent is flagged KERNFS_EMPTY_DIR or is activated but no longer active.
 */
int kernfs_add_one(struct kernfs_node *kn)
{
    struct kernfs_node *parent = kn->parent;
    struct kernfs_iattrs *ps_iattr;
    bool has_ns;
    int ret;

    mutex_lock(&kernfs_mutex);

    ret = -EINVAL;
    has_ns = kernfs_ns_enabled(parent);
    /* @kn must carry a namespace tag exactly when the parent has namespaces enabled */
    if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
         has_ns ? "required" : "invalid", parent->name, kn->name))
        goto out_unlock;

    if (kernfs_type(parent) != KERNFS_DIR) /* the parent must be a directory */
        goto out_unlock;

    ret = -ENOENT;
    if (parent->flags & KERNFS_EMPTY_DIR)  /* refuse parents flagged KERNFS_EMPTY_DIR */
        goto out_unlock;
    /* an activated parent must still be active */
    if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
        goto out_unlock;

    kn->hash = kernfs_name_hash(kn->name, kn->ns); /* key used for rbtree ordering */

    ret = kernfs_link_sibling(kn); /* link @kn into the parent's rbtree */
    if (ret)
        goto out_unlock;

    /* Update timestamps on the parent */
    ps_iattr = parent->iattr;
    if (ps_iattr) {
        struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
        ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
    }

    mutex_unlock(&kernfs_mutex);

    /*
     * Activate the new node unless CREATE_DEACTIVATED is requested.
     * If not activated here, the kernfs user is responsible for
     * activating the node with kernfs_activate().  A node which hasn't
     * been activated is not visible to userland and its removal won't
     * trigger deactivation.
     */
    if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
        kernfs_activate(kn);
    return 0;

out_unlock:
    mutex_unlock(&kernfs_mutex);
    return ret;
}

sysfs紅黑樹中的key:

/**
 *  kernfs_name_hash - compute the rbtree key of a (namespace, name) pair
 *  @name: Null terminated string to hash
 *  @ns:   Namespace tag to hash
 *
 *  Hashes every character of @name, mixes in a 31-bit hash of the @ns
 *  pointer and masks the result to 31 bits so it fits in an off_t.
 *  The values 0, 1 and INT_MAX are never returned; they are reserved for
 *  magic directory entries.
 */
static unsigned int kernfs_name_hash(const char *name, const void *ns)
{
    unsigned long acc = init_name_hash();
    unsigned int remaining;

    for (remaining = strlen(name); remaining; remaining--)
        acc = partial_name_hash(*name++, acc);

    acc = end_name_hash(acc) ^ hash_ptr((void *)ns, 31);
    acc &= 0x7fffffffU;

    /* Steer clear of the reserved values 0, 1 and INT_MAX. */
    if (acc < 2)
        acc += 2;
    if (acc >= INT_MAX)
        acc = INT_MAX - 1;

    return acc;
}

/*
 * Three-way comparison of the key (@hash, @ns, @name) against node @kn.
 *
 * Keys are ordered by hash first, then by namespace pointer, and finally by
 * strcmp() on the name.  Returns a negative value, zero or a positive value
 * when the key sorts before, equal to or after @kn.
 */
static int kernfs_name_compare(unsigned int hash, const char *name,
                   const void *ns, const struct kernfs_node *kn)
{
    if (hash != kn->hash)
        return hash < kn->hash ? -1 : 1;

    if (ns != kn->ns)
        return ns < kn->ns ? -1 : 1;

    return strcmp(name, kn->name);
}
  • kernfs_name_hash: 根據name和ns計算kernfs_node的hash值,保存在kernfs_node.hash域中。

  • kernfs_name_compare: sysfs紅黑樹key的比較函數,比較優先級是: hash > ns > name

kernfs_node鏈入parent節點紅黑樹中:

/**
 *  kernfs_link_sibling - link kernfs_node into sibling rbtree
 *  @kn: kernfs_node of interest
 *
 *  Link @kn into its sibling rbtree which starts from
 *  @kn->parent->dir.children.
 *
 *  Locking:
 *  mutex_lock(kernfs_mutex)
 *
 *  RETURNS:
 *  0 on susccess -EEXIST on failure.
 */
static int kernfs_link_sibling(struct kernfs_node *kn)
{
    struct rb_node **node = &kn->parent->dir.children.rb_node; //parent目錄的紅黑樹
    struct rb_node *parent = NULL;

    while (*node) {  //在parent的目錄中,尋找合適的位置將kn插入parent的紅黑樹中
        struct kernfs_node *pos;
        int result;

        pos = rb_to_kn(*node);
        parent = *node;
        result = kernfs_sd_compare(kn, pos); //優(yōu)先順序: hash > ns > name
        if (result < 0)
            node = &pos->rb.rb_left;
        else if (result > 0)
            node = &pos->rb.rb_right;
        else
            return -EEXIST;
    }

    /* add new node and rebalance the tree */
    rb_link_node(&kn->rb, parent, node);
    rb_insert_color(&kn->rb, &kn->parent->dir.children);

    /* successfully added, account subdir number */
    if (kernfs_type(kn) == KERNFS_DIR)
        kn->parent->dir.subdirs++;

    return 0;
}

在sysfs中創(chuàng)建文件

/*
 * Create the sysfs file for @attr under @kobj without a namespace tag:
 * forwards to sysfs_create_file_ns() with ns == NULL and returns its result.
 */
static inline int __must_check sysfs_create_file(struct kobject *kobj,
                         const struct attribute *attr)
{
    return sysfs_create_file_ns(kobj, attr, NULL);
}

/**
 * sysfs_create_file_ns - create an attribute file for an object with custom ns
 * @kobj: object we're creating for
 * @attr: attribute descriptor
 * @ns: namespace the new file should belong to
 *
 * BUGs if @kobj, @kobj->sd or @attr is NULL.  The file is created as a
 * non-binary attribute with @attr->mode; the return value is whatever
 * sysfs_add_file_mode_ns() returns.
 */
int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
             const void *ns)
{
    BUG_ON(!kobj || !kobj->sd || !attr);

    return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns);

}
EXPORT_SYMBOL_GPL(sysfs_create_file_ns);

/*
 * Create the sysfs file for @attr in directory @parent.
 *
 * Selects the kernfs_ops table that matches the attribute:
 *  - regular attribute (@is_bin false): chosen from which of show/store the
 *    owning kobject's sysfs_ops implements and from SYSFS_PREALLOC in @mode;
 *    the file size is PAGE_SIZE.
 *  - binary attribute (@is_bin true): chosen from the bin_attribute's
 *    mmap/read/write callbacks; the file size is battr->size.
 *
 * Returns 0 on success, -EINVAL if a regular attribute's kobject has no
 * sysfs_ops, or the error reported by __kernfs_create_file() (a duplicate
 * warning is emitted for -EEXIST).
 */
int sysfs_add_file_mode_ns(struct kernfs_node *parent,
               const struct attribute *attr, bool is_bin,
               umode_t mode, const void *ns)
{
    struct lock_class_key *key = NULL;
    const struct kernfs_ops *ops;
    struct kernfs_node *kn;
    loff_t size;

    if (!is_bin) {
        struct kobject *kobj = parent->priv;
        const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;

        /* every kobject with an attribute needs a ktype assigned */
        if (WARN(!sysfs_ops, KERN_ERR
             "missing sysfs attribute operations for kobject: %s\n",
             kobject_name(kobj)))
            return -EINVAL;
        /* pick the read/write operation table */
        if (sysfs_ops->show && sysfs_ops->store) {
            if (mode & SYSFS_PREALLOC)
                ops = &sysfs_prealloc_kfops_rw;
            else
                ops = &sysfs_file_kfops_rw;
        } else if (sysfs_ops->show) {
            if (mode & SYSFS_PREALLOC)
                ops = &sysfs_prealloc_kfops_ro;
            else
                ops = &sysfs_file_kfops_ro;
        } else if (sysfs_ops->store) {
            if (mode & SYSFS_PREALLOC)
                ops = &sysfs_prealloc_kfops_wo;
            else
                ops = &sysfs_file_kfops_wo;
        } else
            ops = &sysfs_file_kfops_empty;

        size = PAGE_SIZE;
    } else {
        struct bin_attribute *battr = (void *)attr;

        if (battr->mmap)
            ops = &sysfs_bin_kfops_mmap;
        else if (battr->read && battr->write)
            ops = &sysfs_bin_kfops_rw;
        else if (battr->read)
            ops = &sysfs_bin_kfops_ro;
        else if (battr->write)
            ops = &sysfs_bin_kfops_wo;
        else
            ops = &sysfs_file_kfops_empty;

        size = battr->size;
    }

#ifdef CONFIG_DEBUG_LOCK_ALLOC
    if (!attr->ignore_lockdep)
        key = attr->key ?: (struct lock_class_key *)&attr->skey;
#endif
    kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops,
                  (void *)attr, ns, key); /* create the attribute file */
    if (IS_ERR(kn)) {
        if (PTR_ERR(kn) == -EEXIST)
            sysfs_warn_dup(parent, attr->name);
        return PTR_ERR(kn);
    }
    return 0;
}

通過上面的代碼跟蹤,創建屬性文件由__kernfs_create_file實現,最終仍然是調用kernfs_new_node和kernfs_add_one。

/**
 * __kernfs_create_file - kernfs internal function to create a file
 * @parent: directory to create the file in
 * @name: name of the file
 * @mode: mode of the file
 * @size: size of the file
 * @ops: kernfs operations for the file
 * @priv: private data for the file
 * @ns: optional namespace tag of the file
 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
 *
 * Allocates a KERNFS_FILE node with kernfs_new_node(), fills in its
 * attributes and links it into @parent with kernfs_add_one().
 *
 * Returns the created node on success, ERR_PTR() value on error
 * (-ENOMEM if the node cannot be allocated, otherwise the error from
 * kernfs_add_one()).
 */
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
                     const char *name,
                     umode_t mode, loff_t size,
                     const struct kernfs_ops *ops,
                     void *priv, const void *ns,
                     struct lock_class_key *key)
{
    struct kernfs_node *node;
    int err;

    /* Regular file: keep only the permission bits of @mode. */
    node = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG,
                   KERNFS_FILE);
    if (!node)
        return ERR_PTR(-ENOMEM);

    node->ns = ns;
    node->priv = priv;
    node->attr.size = size;
    node->attr.ops = ops;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
    if (key) {
        lockdep_init_map(&node->dep_map, "s_active", key, 0);
        node->flags |= KERNFS_LOCKDEP;
    }
#endif

    /*
     * node->attr.ops is accessible only while holding active ref.  We
     * need to know whether some ops are implemented outside active
     * ref.  Cache their existence in flags.
     */
    if (ops->seq_show)
        node->flags |= KERNFS_HAS_SEQ_SHOW;
    if (ops->mmap)
        node->flags |= KERNFS_HAS_MMAP;

    /* Link into the parent's rbtree; drop our reference on failure. */
    err = kernfs_add_one(node);
    if (err) {
        kernfs_put(node);
        return ERR_PTR(err);
    }
    return node;
}

在sysfs_add_file_mode_ns函數中,根據sysfs_ops是否實現了show/store,以及mode中是否設置了SYSFS_PREALLOC,選擇不同的kernfs_ops讀寫操作表。下面以sysfs_prealloc_kfops_rw結構為例,其他結構類似,不贅述。

/*
 * kernfs_ops for a regular (non-binary) sysfs file that is both readable
 * and writable and has .prealloc set.
 */
static const struct kernfs_ops sysfs_prealloc_kfops_rw = {
    .read       = sysfs_kf_read,
    .write      = sysfs_kf_write,
    .prealloc   = true,
};

/*
 * kernfs read callback for regular sysfs files with pre-alloc.
 *
 * Calls the ->show() method of the sysfs_ops table returned by
 * sysfs_file_ops() for this node, passing the kobject stored in the parent
 * node's ->priv and the attribute stored in this node's ->priv.
 *
 * Returns 0 for a non-zero @pos or when @buf is not the pre-allocated
 * buffer, a negative errno if ->show() fails, otherwise the number of bytes
 * produced, capped at @count.
 */
static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
                 size_t count, loff_t pos)
{
    const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
    struct kobject *kobj = of->kn->parent->priv;
    ssize_t len;

    /*
     * If buf != of->prealloc_buf, we don't know how
     * large it is, so cannot safely pass it to ->show
     */
    if (pos || WARN_ON_ONCE(buf != of->prealloc_buf))
        return 0;

    len = ops->show(kobj, of->kn->priv, buf);
    /*
     * ->show() returns ssize_t.  Holding the result in a size_t turned a
     * negative errno into a huge positive value, so min() reported @count
     * bytes as read instead of passing the error on.
     */
    if (len < 0)
        return len;
    return min_t(ssize_t, count, len);
}

/*
 * kernfs write callback for regular sysfs files.
 *
 * A zero-length write returns 0 without calling into the attribute.
 * Otherwise the write is handed to the ->store() method of the sysfs_ops
 * table returned by sysfs_file_ops(), with the kobject taken from the parent
 * node's ->priv and the attribute from this node's ->priv; its return value
 * is passed back unchanged.  @pos is not used.
 */
static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
                  size_t count, loff_t pos)
{   /* look up the sysfs_ops table for this node */
    const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
    struct kobject *kobj = of->kn->parent->priv;

    if (!count)
        return 0;

    return ops->store(kobj, of->kn->priv, buf, count); /* dispatch to the sysfs_ops ->store() */
}

關於屬性文件的讀寫操作,最終都回調到kobject的ktype域中的sysfs_ops操作表(kobj->ktype->sysfs_ops),這個操作表是在kobject_init函數中隨ktype一起設置的。回顧kobject_create函數:

/*
 * Allocate a zeroed kobject and initialise it with dynamic_kobj_ktype.
 *
 * Returns the new kobject, or NULL if the allocation fails.
 */
struct kobject *kobject_create(void)
{
    struct kobject *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

    if (obj)
        kobject_init(obj, &dynamic_kobj_ktype);

    return obj;
}

/*
 * ktype installed by kobject_create(): release goes through
 * dynamic_kobj_release and attribute access through kobj_sysfs_ops.
 */
static struct kobj_type dynamic_kobj_ktype = {
    .release    = dynamic_kobj_release,
    .sysfs_ops  = &kobj_sysfs_ops,
};

/*
 * sysfs_ops table whose show/store entries are kobj_attr_show() and
 * kobj_attr_store(); referenced by dynamic_kobj_ktype and exported for
 * other users.
 */
const struct sysfs_ops kobj_sysfs_ops = {
    .show   = kobj_attr_show,
    .store  = kobj_attr_store,
};
EXPORT_SYMBOL_GPL(kobj_sysfs_ops);

kobject的sysfs的show和store方法為:kobj_attr_show和kobj_attr_store

/*
 * sysfs ->show() entry for kobj_attribute based attributes.
 *
 * Recovers the enclosing kobj_attribute from @attr and forwards to its
 * ->show() callback.  Returns that callback's result, or -EIO when the
 * attribute provides no ->show().
 */
static ssize_t kobj_attr_show(struct kobject *kobj, struct attribute *attr,
                  char *buf)
{
    struct kobj_attribute *kattr =
        container_of(attr, struct kobj_attribute, attr);

    if (!kattr->show)
        return -EIO;

    return kattr->show(kobj, kattr, buf);
}

/*
 * sysfs ->store() entry for kobj_attribute based attributes.
 *
 * Recovers the enclosing kobj_attribute from @attr and forwards to its
 * ->store() callback.  Returns that callback's result, or -EIO when the
 * attribute provides no ->store().
 */
static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr,
                   const char *buf, size_t count)
{
    struct kobj_attribute *kattr =
        container_of(attr, struct kobj_attribute, attr);

    if (!kattr->store)
        return -EIO;

    return kattr->store(kobj, kattr, buf, count);
}

真正對屬性文件進行讀寫的回調由業務子系統實現。

在sysfs中創(chuàng)建符號鏈接

/**
 *  sysfs_create_link - create symlink between two objects.
 *  @kobj:  object whose directory we're creating the link in.
 *  @target:    object we're pointing to.
 *  @name:      name of the symlink.
 *
 *  Forwards to sysfs_do_create_link() with warn set to 1 and returns its
 *  result.
 */
int sysfs_create_link(struct kobject *kobj, struct kobject *target,
              const char *name)
{
    return sysfs_do_create_link(kobj, target, name, 1);
}
EXPORT_SYMBOL_GPL(sysfs_create_link);

/*
 * Resolve the directory the symlink is created in and hand over to
 * sysfs_do_create_link_sd().
 *
 * A NULL @kobj selects sysfs_root_kn; otherwise @kobj->sd is used.
 * Returns -EFAULT if the chosen directory node is NULL, else the result of
 * sysfs_do_create_link_sd().
 */
static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
                const char *name, int warn)
{
    struct kernfs_node *dir = kobj ? kobj->sd : sysfs_root_kn;

    if (!dir)
        return -EFAULT;

    return sysfs_do_create_link_sd(dir, target, name, warn);
}

/*
 * Create a symlink called @name in directory @parent pointing at the sysfs
 * node of @target_kobj.
 *
 * Returns 0 on success, -ENOENT if @target_kobj has no sysfs node, or the
 * error from kernfs_create_link().  When @warn is set, a duplicate warning
 * is printed for -EEXIST.
 */
static int sysfs_do_create_link_sd(struct kernfs_node *parent,
                   struct kobject *target_kobj,
                   const char *name, int warn)
{
    struct kernfs_node *link, *target;
    int err;

    BUG_ON(!name || !parent);

    /*
     * We don't own @target_kobj and it may be removed at any time.
     * Synchronize using sysfs_symlink_target_lock.  See
     * sysfs_remove_dir() for details.
     */
    spin_lock(&sysfs_symlink_target_lock);
    target = target_kobj->sd;
    if (target)
        kernfs_get(target);
    spin_unlock(&sysfs_symlink_target_lock);

    if (!target)
        return -ENOENT;

    link = kernfs_create_link(parent, name, target);
    /* Drop the reference taken above, whatever the outcome. */
    kernfs_put(target);

    if (!IS_ERR(link))
        return 0;

    err = PTR_ERR(link);
    if (warn && err == -EEXIST)
        sysfs_warn_dup(parent, name);
    return err;
}

由上面的代碼追蹤可知,創建符號鏈接最終由kernfs_create_link函數完成。

/**
 * kernfs_create_link - create a symlink
 * @parent: directory to create the symlink in
 * @name: name of the symlink
 * @target: target node for the symlink to point to
 *
 * Allocates a KERNFS_LINK node, records @target in it (taking a reference
 * on @target) and links it into @parent.  When @parent has namespaces
 * enabled the link inherits @target's namespace tag.
 *
 * Returns the created node on success, ERR_PTR() value on error.
 */
struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
                       const char *name,
                       struct kernfs_node *target)
{
    struct kernfs_node *link;
    int rc;

    link = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);
    if (!link)
        return ERR_PTR(-ENOMEM);

    if (kernfs_ns_enabled(parent))
        link->ns = target->ns;
    link->symlink.target_kn = target;
    kernfs_get(target); /* ref owned by symlink */

    rc = kernfs_add_one(link);
    if (rc) {
        kernfs_put(link);
        return ERR_PTR(rc);
    }
    return link;
}

與創建目錄和文件類似,最終仍然是調用kernfs_new_node和kernfs_add_one實現。

基于內(nèi)核對象編程套路

目標:在sysfs中創建一個目錄/sys/kernel/storage/,並在該目錄下創建一個文件value。value可以寫入整型數據,隨後可以讀出。
  • 定義內核對象

/* Sample kernel object: an embedded kobject plus the integer it exposes. */
struct storage_obj {
    struct kobject kobj;  /* embedded kobject; container_of() maps it back to the storage_obj */
    int val;  /* holds the value that was written */
};
  • 定義屬性類型

struct storage_attribute {
    struct attribute *attr;
    ssize_t (*show)(struct kobject *, struct attribute *, char *);
    ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t);
}
  • 聲明屬性 
    定義屬性的show和store方法,如下:

//定義并初始化storage_attribute
struct storage_attribute *sattr = &struct storage_attribute {
    .attr = {.name = "value", .mode = 0666},
    .show = storage_show,
    .store = storage_store,
};
  • 實現(xiàn)sysfs操作

ssize_t storage_show(struct kobject *kobj, struct attribute *attr, char *buf) 
{
    struct storage *stor = container_of(kobj, struct storage_obj, kobj);
    stor->val = atoi(buf);
}

/*
 * sysfs write handler for "value": parse a decimal integer from @buf and
 * store it in the object.
 *
 * @kobj is the kobject embedded in a struct storage_obj, @s the number of
 * bytes written by the user.  Returns @s when the whole input was accepted,
 * or the negative errno from kstrtoint() when it is not a valid integer
 * (the stored value is left unchanged in that case).
 */
ssize_t storage_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t s)
{
    struct storage_obj *sobj = container_of(kobj, struct storage_obj, kobj);
    int val;
    int err;

    err = kstrtoint(buf, 10, &val);
    if (err)
        return err;

    sobj->val = val;
    return s;
}
  • 定義內(nèi)核對象release方法 
    release方法設(shè)置在kobj_type結(jié)構(gòu)中

void storage_release(struct kobject *kobj)
{
    ......
}
  • 聲明內(nèi)核對象類型

struct storage_ktype {
    struct kobj_type *ktype;
}
  • 封裝對象屬性添加和刪除方法 
    需要將value屬性添加到內核對象,或者從內核對象刪除,可以直接調用sysfs_create_file和sysfs_remove_file。但大多數情況下,會對這兩個方法做一層封裝:storage_create_file和storage_remove_file。

/*
 * Add attribute @attr to @sobj by creating the matching sysfs file.
 *
 * A NULL @sobj is treated as a no-op and reported as success (0);
 * otherwise the result of sysfs_create_file() is returned.
 */
int storage_create_file(struct storage_obj *sobj, const struct storage_attribute *attr)
{
    if (!sobj)
        return 0;

    return sysfs_create_file(&sobj->kobj, &attr->attr);
}

/*
 * Remove the sysfs file of attribute @attr from @sobj.
 * Does nothing when @sobj is NULL.
 */
void storage_remove_file(struct storage_obj *sobj, const struct storage_attribute *attr)
{
    if (!sobj)
        return;

    sysfs_remove_file(&sobj->kobj, &attr->attr);
}
  • 定義對象的創(chuàng)建和銷毀方法

struct storage_obj * create_storage_obj() 
{
    struct storage_obj *sobj = (struct storage_obj *)malloc(struct storage_obj);
    struct storage_ktype *stype = (struct storage_ktype *)malloc(struct storage_ktype);
    sobj->parent = kernel_kobj;
    kobject_init_and_add(&sobj->kobj, &stype->ktype);

    return sobj
}

/*
 * Tear down a storage object given its embedded kobject.
 *
 * kobject_del() removes the object from sysfs; kobject_put() then drops
 * the reference held since initialisation.  A registered kobject must not
 * be freed directly: the memory is released by the ktype's release callback
 * (storage_release) when the last reference goes away.
 */
void destroy_storage_obj(struct kobject *kobj)
{
    kobject_del(kobj);
    kobject_put(kobj);
}
  • 實現(xiàn)模塊加載和卸載方法 
    加載時調(diào)用create_storage_obj, 卸載時調(diào)用destroy_storage_obj


設(shè)備驅(qū)動模型

概述

Linux的設(shè)備驅(qū)動模型能夠帶來以下的優(yōu)點: 
* 使用統(tǒng)一機制來表達設(shè)備與驅(qū)動之間的關(guān)系,規(guī)范設(shè)備驅(qū)動的編寫,核心代碼復用。 
* 將系統(tǒng)中的設(shè)備以樹結(jié)構(gòu)組織,并且通過sysfs將其呈現(xiàn)在用戶空間——包括所有的總線和內(nèi)部連接。 
* 支持設(shè)備的熱拔插機制。 
* 支持通用的電源管理機制,通過由葉子節(jié)點到根節(jié)點的方向遍歷設(shè)備樹,確保子設(shè)備在父設(shè)備之前斷電。

內(nèi)核基于內(nèi)核對象和sysfs,通過抽象以下五種概念,實現(xiàn)了設(shè)備驅(qū)動模型的框架,使得編寫子系統(tǒng)成為“八股文”。 
1. bus_type: 總線類型,每個子系統(tǒng)有且只有一個總線類型,由bus_type和subsys_private兩個結(jié)構(gòu)共同描述。 
2. device: 設(shè)備,描述掛在總線類型中的設(shè)備,由device和device_private兩個結(jié)構(gòu)共同描述。 
3. driver: 驅(qū)動, 描述掛在總線類型中的驅(qū)動模塊,由device_driver和driver_private兩個結(jié)構(gòu)共同描述。 
4. class: 類,每個總線類型有且只有一個類,由class和subsys_private兩個結(jié)構(gòu)共同描述。 
5. class_interface: 接口,每個類有多個接口,由class_interface結(jié)構(gòu)描述。

在Linux內(nèi)核中,子系統(tǒng)是由bus_type, device, driver, class和class_interface之間的關(guān)系所描述,而設(shè)備驅(qū)動模型正是這些關(guān)系的核心實現(xiàn),使得在編寫子系統(tǒng)程序時,只要遵循設(shè)備模型的套路,便不需要關(guān)注于這些復雜的關(guān)系,只需實現(xiàn)自身的業(yè)務(wù)邏輯。

每個子系統都有一個總線類型,總線類型擁有一個設備鏈表和一個驅動鏈表,用於連接由該總線類型已發現的設備和已加載的驅動,設備發現和驅動加載的順序是任意的。每個設備最多綁定到一個驅動,被綁定了驅動的設備可以正常工作。除此之外,每個設備可以唯一屬於某個類,類中包含多個接口,接口的方法作用於設備,不管是先添加接口,還是先發現設備。

總線類型

總線類型的數(shù)據(jù)結(jié)構(gòu)

/*
 * Bus type: the part of a subsystem description that the subsystem author
 * fills in (name, attribute groups and callbacks).  The driver-core side of
 * the bookkeeping lives in the subsys_private reached through ->p.
 */
struct bus_type {
    const char      *name;         /* subsystem name */
    const char      *dev_name;     /* used by the subsystem to generate device names */
    struct device       *dev_root;
    struct device_attribute *dev_attrs; /* use dev_groups instead */
    const struct attribute_group **bus_groups;  /* attribute groups of the bus type itself */
    const struct attribute_group **dev_groups;  /* attribute groups for devices */
    const struct attribute_group **drv_groups;  /* attribute groups for drivers */

    int (*match)(struct device *dev, struct device_driver *drv);    /* checks whether a device and a driver can be bound */
    int (*uevent)(struct device *dev, struct kobj_uevent_env *env); /* sets bus-specific environment variables before an event is sent */
    int (*probe)(struct device *dev);     /* when a device can be bound to a driver: initialise the device and perform the binding */
    int (*remove)(struct device *dev);    /* called when a device is unbound from its driver */
    void (*shutdown)(struct device *dev); /* called when the device is powered off */

    int (*online)(struct device *dev);    /* called when the device is brought online */
    int (*offline)(struct device *dev);   /* called when the device is taken offline */

    int (*suspend)(struct device *dev, pm_message_t state); /* called when the device enters a power-saving state */
    int (*resume)(struct device *dev);                      /* called when the device returns to its normal state */

    const struct dev_pm_ops *pm;  /* power-management operations */

    const struct iommu_ops *iommu_ops;

    struct subsys_private *p;         /* private data of the subsystem */
    struct lock_class_key lock_key;
};

/*
 * Driver-core private part of a bus type (or class): the ksets, lists and
 * flags that express its relations inside the device model.
 */
struct subsys_private {
    struct kset subsys;          /* kset of the bus; /sys/bus/scsi for the scsi subsystem */
    struct kset *devices_kset;   /* kset of the devices; /sys/bus/scsi/devices for scsi */
    struct list_head interfaces; /* list of interfaces of the bus */
    struct mutex mutex;          

    struct kset *drivers_kset;   /* kset of the drivers; /sys/bus/scsi/drivers for scsi */
    struct klist klist_devices;  /* list of the devices on the bus */
    struct klist klist_drivers;  /* list of the drivers on the bus */
    struct blocking_notifier_head bus_notifier; /* notifier chain called when the subsystem changes */
    unsigned int drivers_autoprobe:1;  /* whether to probe automatically when a device or driver is added */
    struct bus_type *bus;        /* back pointer to the bus type */

    struct kset glue_dirs;
    struct class *class;         /* points to the class of the bus type */
};

從上面的兩個結構可以看到,bus_type包含的主要是實現子系統時需要具體關注的內容,比如name和一組回調函數。而subsys_private結構主要是設備驅動模型中關係的表達,如字段subsys的類型是kset,描述該子系統在sysfs中的表達;klist_devices和klist_drivers分別是設備鏈表和驅動鏈表,用於管理總線類型的所有設備和驅動。之後仍然會遇到xxx_private的結構,以這種方式命名的結構,都是給設備驅動模型核心使用的,業務子系統無需也不能使用。

總線類型注冊/反注冊

實現(xiàn)子系統(tǒng)的第一步就是創(chuàng)建bus_type,并將其注冊到系統(tǒng),此時需要調(diào)用bus_register:

/**
 * bus_register - register a driver-core subsystem
 * @bus: bus to register
 *
 * Once we have that, we register the bus with the kobject
 * infrastructure, then register the children subsystems it has:
 * the devices and drivers that belong to the subsystem.
 *
 * Returns 0 on success or a negative errno.  On failure every step that
 * already succeeded is undone in reverse order, the private data is freed
 * and @bus->p is reset to NULL.
 */
int bus_register(struct bus_type *bus)
{
    int retval;
    struct subsys_private *priv;
    struct lock_class_key *key = &bus->lock_key;
    /* allocate the private data of the bus type */
    priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL);
    if (!priv)
        return -ENOMEM;

    priv->bus = bus; /* tie bus_type and subsys_private together */
    bus->p = priv;

    BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier);
    /* give the kobject the bus name; this is the name shown in sysfs */
    retval = kobject_set_name(&priv->subsys.kobj, "%s", bus->name);
    if (retval)
        goto out;

    priv->subsys.kobj.kset = bus_kset;
    priv->subsys.kobj.ktype = &bus_ktype;
    priv->drivers_autoprobe = 1;    /* enable automatic probing */

    retval = kset_register(&priv->subsys);  /* add the bus type to the device model */
    if (retval)
        goto out;

    retval = bus_create_file(bus, &bus_attr_uevent); /* create the uevent attribute file */
    if (retval)
        goto bus_uevent_fail;

    priv->devices_kset = kset_create_and_add("devices", NULL,  /* create the devices directory */
                         &priv->subsys.kobj);
    if (!priv->devices_kset) {
        retval = -ENOMEM;
        goto bus_devices_fail;
    }

    priv->drivers_kset = kset_create_and_add("drivers", NULL,  /* create the drivers directory */
                         &priv->subsys.kobj);
    if (!priv->drivers_kset) {
        retval = -ENOMEM;
        goto bus_drivers_fail;
    }
    /* initialise the lists and the lock */
    INIT_LIST_HEAD(&priv->interfaces);
    __mutex_init(&priv->mutex, "subsys mutex", key);
    klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put);
    klist_init(&priv->klist_drivers, NULL, NULL);

    retval = add_probe_files(bus); /* add the probe files drivers_autoprobe and drivers_probe to sysfs */
    if (retval)
        goto bus_probe_files_fail;

    retval = bus_add_groups(bus, bus->bus_groups); /* add the attribute files of the bus type */
    if (retval)
        goto bus_groups_fail;

    pr_debug("bus: '%s': registered\n", bus->name);
    return 0;
    /* roll back on failure */
bus_groups_fail:
    remove_probe_files(bus);
bus_probe_files_fail:
    kset_unregister(bus->p->drivers_kset);
bus_drivers_fail:
    kset_unregister(bus->p->devices_kset);
bus_devices_fail:
    bus_remove_file(bus, &bus_attr_uevent);
bus_uevent_fail:
    kset_unregister(&bus->p->subsys);
out:
    kfree(bus->p);
    bus->p = NULL;
    return retval;
}
EXPORT_SYMBOL_GPL(bus_register);

注冊總線類型后,便可以在系統(tǒng)看到:

root@ubuntu16:~# ls /sys/bus/scsi -l
total 0
drwxr-xr-x 2 root root    0 Sep  5 16:01 devices
drwxr-xr-x 4 root root    0 Sep  2 09:44 drivers
-rw-r--r-- 1 root root 4096 Sep  5 11:29 drivers_autoprobe
--w------- 1 root root 4096 Sep  5 11:29 drivers_probe
--w------- 1 root root 4096 Sep  2 09:44 uevent
root@ubuntu16:~#

當從系統(tǒng)中注銷子系統(tǒng)時,需要調(diào)用bus_unregister,完成總線類型的反注冊:

/**
 * bus_unregister - remove a bus from the system
 * @bus: bus.
 *
 * Unregister the child subsystems and the bus itself.
 * Finally, we call bus_put() to release the refcount
 *
 * NOTE(review): the body shown here ends with kset_unregister() on the bus
 * kset and contains no explicit bus_put() call; the sentence above comes
 * from the original kernel-doc.
 */
void bus_unregister(struct bus_type *bus)
{
    pr_debug("bus: '%s': unregistering\n", bus->name);
    if (bus->dev_root)
        device_unregister(bus->dev_root);     /* remove the root device */
    bus_remove_groups(bus, bus->bus_groups);  /* remove the attribute files of the bus */
    remove_probe_files(bus);                  /* remove the probe files drivers_autoprobe and drivers_probe */
    kset_unregister(bus->p->drivers_kset);    /* remove the drivers directory */
    kset_unregister(bus->p->devices_kset);    /* remove the devices directory */
    bus_remove_file(bus, &bus_attr_uevent);   /* remove the uevent file */
    kset_unregister(&bus->p->subsys);         /* remove the bus directory */
}
EXPORT_SYMBOL_GPL(bus_unregister);

設(shè)備

設(shè)備的數(shù)據(jù)結(jié)構(gòu)

/*
 * Generic device of the driver model.  Embeds a kobject and points to the
 * bus type, class and driver it is associated with; the driver-core
 * internal linkage is kept in the device_private reached through ->p.
 */
struct device {
    struct device       *parent;  /* parent device, e.g. the HBA */

    struct device_private   *p;   /* driver-core private data of the device */

    struct kobject kobj;          /* embedded kobject */
    const char      *init_name; /* initial name of the device */
    const struct device_type *type;  /* device type: the fields and methods factored out per type */

    struct mutex        mutex;  /* mutex to synchronize calls to its driver */

    struct bus_type *bus;       /* type of bus device is on; the bus the device belongs to */
    struct device_driver *driver;   /* which driver has allocated this device */
    void        *platform_data; /* Platform specific data, device core doesn't touch it */
    void        *driver_data;   /* Driver data, set and get with dev_set/get_drvdata */
    struct dev_pm_info  power;
    struct dev_pm_domain    *pm_domain;

#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
    struct irq_domain   *msi_domain;
#endif
#ifdef CONFIG_PINCTRL
    struct dev_pin_info *pins;
#endif
#ifdef CONFIG_GENERIC_MSI_IRQ
    struct list_head    msi_list;
#endif

#ifdef CONFIG_NUMA
    int     numa_node;  /* NUMA node this device is close to */
#endif
    u64     *dma_mask;  /* dma mask (if dma'able device) */
    u64     coherent_dma_mask;/* Like dma_mask, but for
                         alloc_coherent mappings as
                         not all hardware supports
                         64 bit addresses for consistent
                         allocations such descriptors. */
    unsigned long   dma_pfn_offset;

    struct device_dma_parameters *dma_parms;

    struct list_head    dma_pools;  /* dma pools (if dma'ble) */

    struct dma_coherent_mem *dma_mem; /* internal for coherent mem override */
#ifdef CONFIG_DMA_CMA
    struct cma *cma_area;       /* contiguous memory area for dma allocations */
#endif
    /* arch specific additions */
    struct dev_archdata archdata;

    struct device_node  *of_node; /* associated device tree node */
    struct fwnode_handle    *fwnode; /* firmware device node */

    dev_t           devt;   /* dev_t, creates the sysfs "dev"; the device number */
    u32         id; /* device instance */

    spinlock_t      devres_lock;
    struct list_head    devres_head; /* head of the device resource list */

    struct klist_node   knode_class; /* node in the class's device list */
    struct class        *class;      /* the class this device is linked into */
    const struct attribute_group **groups;  /* optional groups; attributes specific to this device */

    void    (*release)(struct device *dev);  /* release callback of the device */
    struct iommu_group  *iommu_group;

    bool            offline_disabled:1;
    bool            offline:1;
};

/*
 * Driver-core private part of a device: the list nodes that link it to its
 * parent, driver and bus, plus the list of its own children.
 */
struct device_private {
    struct klist klist_children;     /* list of child devices */
    struct klist_node knode_parent;  /* node in the parent's children list */
    struct klist_node knode_driver;  /* node in the driver's device list */
    struct klist_node knode_bus;     /* node in the bus's device list */
    struct list_head deferred_probe; /* entry in the deferred-probe list */
    struct device *device;           /* back pointer to the associated device */
};

/* Attributes and methods shared by all devices of one type. */
struct device_type {
    const char *name;  /* name of the device type */
    const struct attribute_group **groups;  /* attribute groups common to devices of this type */
    int (*uevent)(struct device *dev, struct kobj_uevent_env *env); /* called before an event is sent, to set its environment variables */
    char *(*devnode)(struct device *dev, umode_t *mode, /* provides a naming hint when the device node is created */
             kuid_t *uid, kgid_t *gid);
    void (*release)(struct device *dev);    /* called when the device is released */

    const struct dev_pm_ops *pm;
};

在設備驅動模型中,device結構有bus域,指向device所屬的總線類型;class域指向device所屬的唯一的類;driver域指向設備所綁定的驅動。與內核對象一樣,設備也被組織成層次結構,通過parent指向父設備。

device_private結(jié)構(gòu)由設(shè)備驅(qū)動模型處理,維護和其他結(jié)構(gòu)之間的內(nèi)部關(guān)系。device_type結(jié)構(gòu)定義設(shè)備公有的屬性和方法。

設(shè)備的注冊與反注冊

當設(shè)備被發(fā)現(xiàn)后,需要將設(shè)備注冊到系統(tǒng),需要調(diào)用device_register函數(shù):

/**
 * device_register - register a device with the system.
 * @dev: pointer to the device structure
 *
 * This happens in two clean steps - initialize the device
 * and add it to the system. The two steps can be called
 * separately, but this is the easiest and most common.
 * I.e. you should only call the two helpers separately if
 * have a clearly defined need to use and refcount the device
 * before it is added to the hierarchy.
 *
 * For more information, see the kerneldoc for device_initialize()
 * and device_add().
 *
 * NOTE: _Never_ directly free @dev after calling this function, even
 * if it returned an error! Always use put_device() to give up the
 * reference initialized in this function instead.
 *
 * Returns the result of device_add().
 */
int device_register(struct device *dev)
{
    device_initialize(dev);  /* initialise the device structure */
    return device_add(dev);  /* add the device to the system */
}
EXPORT_SYMBOL_GPL(device_register);


/*
 * Prepare @dev for use by the driver core: attach it to devices_kset,
 * initialise its embedded kobject with device_ktype, and initialise its
 * lists, locks, power-management state and NUMA node (-1).
 * The device is not added to sysfs here.
 */
void device_initialize(struct device *dev)
{
    dev->kobj.kset = devices_kset;             /* /sys/devices/ */
    kobject_init(&dev->kobj, &device_ktype);   /* the kobject's ktype is device_ktype */
    INIT_LIST_HEAD(&dev->dma_pools);
    mutex_init(&dev->mutex);
    lockdep_set_novalidate_class(&dev->mutex);
    spin_lock_init(&dev->devres_lock);
    INIT_LIST_HEAD(&dev->devres_head);
    device_pm_init(dev);
    set_dev_node(dev, -1);
#ifdef CONFIG_GENERIC_MSI_IRQ
    INIT_LIST_HEAD(&dev->msi_list);
#endif
}
EXPORT_SYMBOL_GPL(device_initialize);

device_register函數調用device_initialize對device結構進行初始化,再調用device_add函數將設備添加到系統。

/*
 * Add an initialised device to the device hierarchy.
 *
 * In order: makes sure the private data exists, settles the device name,
 * resolves the parent kobject, adds the kobject to sysfs, creates the
 * uevent file, class symlinks and attribute files, adds the device to its
 * bus and to power management, creates the "dev" file and device node when
 * the device has a major number, notifies bus listeners, emits KOBJ_ADD,
 * tries to bind a driver, and finally links the device into its parent's
 * and class's lists.
 *
 * Returns 0 on success or a negative errno; on failure the steps already
 * taken are undone through the label chain at the end of the function.
 * The reference taken on @dev at entry is dropped on every path that
 * obtained it.
 */
int device_add(struct device *dev)
{
    struct device *parent = NULL;
    struct kobject *kobj;
    struct class_interface *class_intf;
    int error = -EINVAL;

    dev = get_device(dev);
    if (!dev)
        goto done;

    if (!dev->p) {  /* no device_private yet: allocate and initialise it here */
        error = device_private_init(dev);
        if (error)
            goto done;
    }

    /*
     * for statically allocated devices, which should all be converted
     * some day, we need to initialize the name. We prevent reading back
     * the name, and force the use of dev_name()
     */
    if (dev->init_name) {
        dev_set_name(dev, "%s", dev->init_name); /* set the name of the device's kobject */
        dev->init_name = NULL;
    }

    /* subsystems can specify simple device enumeration */
    if (!dev_name(dev) && dev->bus && dev->bus->dev_name) /* still unnamed: build the name from the bus's dev_name and the device id */
        dev_set_name(dev, "%s%u", dev->bus->dev_name, dev->id);

    if (!dev_name(dev)) {
        error = -EINVAL;
        goto name_error;
    }

    pr_debug("device: '%s': %s\n", dev_name(dev), __func__);

    parent = get_device(dev->parent);
    kobj = get_device_parent(dev, parent);
    if (kobj)
        dev->kobj.parent = kobj;  /* set the parent of the device's kobject */

    /* use parent numa_node */
    if (parent && (dev_to_node(dev) == NUMA_NO_NODE))
        set_dev_node(dev, dev_to_node(parent));

    /* first, register with generic layer. */
    /* we require the name to be set before, and pass NULL */
    error = kobject_add(&dev->kobj, dev->kobj.parent, NULL); /* add the device under its parent's directory */
    if (error)
        goto Error;

    /* notify platform of device entry */
    if (platform_notify)
        platform_notify(dev);

    error = device_create_file(dev, &dev_attr_uevent); /* create the uevent file in the device directory */
    if (error)
        goto attrError;

    error = device_add_class_symlinks(dev); /* create the class-related symlinks for the device */
    if (error)
        goto SymlinkError;
    error = device_add_attrs(dev); /* create the files for the device's default attributes */
    if (error)
        goto AttrsError;
    error = bus_add_device(dev);  /* add the device to its bus_type */
    if (error)
        goto BusError;
    error = dpm_sysfs_add(dev);
    if (error)
        goto DPMError;
    device_pm_add(dev);

    if (MAJOR(dev->devt)) {
        error = device_create_file(dev, &dev_attr_dev); /* create the "dev" attribute file, which holds the device number */
        if (error)
            goto DevAttrError;

        /*
         * Create a symlink to the device directory (the original note says
         * under /sys/block and /sys/char; presumably /sys/dev/block and
         * /sys/dev/char -- verify).
         */
        error = device_create_sys_dev_entry(dev);
        if (error)
            goto SysEntryError;

        devtmpfs_create_node(dev); /* create the device file under /dev */
    }

    /* Notify clients of device addition.  This call must come
     * after dpm_sysfs_add() and before kobject_uevent().
     */
    if (dev->bus)
        blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
                         BUS_NOTIFY_ADD_DEVICE, dev);

    kobject_uevent(&dev->kobj, KOBJ_ADD); /* send the ADD event for the device */
    bus_probe_device(dev);  /* try to bind the device to a device_driver */
    if (parent)  /* a parent was given: add the device to the parent's children list */
        klist_add_tail(&dev->p->knode_parent,
                   &parent->p->klist_children);

    if (dev->class) {  /* a class was set: add the device to the class's device list */
        mutex_lock(&dev->class->p->mutex);
        /* tie the class to the device */
        klist_add_tail(&dev->knode_class,
                   &dev->class->p->klist_devices);

        /* notify any interfaces that the device is here */
        list_for_each_entry(class_intf,  /* call add_dev of every class_interface of the device's class */
                    &dev->class->p->interfaces, node)
            if (class_intf->add_dev)
                class_intf->add_dev(dev, class_intf);
        mutex_unlock(&dev->class->p->mutex);
    }
done:
    put_device(dev);
    return error;
 SysEntryError:
    if (MAJOR(dev->devt))
        device_remove_file(dev, &dev_attr_dev);
 DevAttrError:
    device_pm_remove(dev);
    dpm_sysfs_remove(dev);
 DPMError:
    bus_remove_device(dev);
 BusError:
    device_remove_attrs(dev);
 AttrsError:
    device_remove_class_symlinks(dev);
 SymlinkError:
    device_remove_file(dev, &dev_attr_uevent);
 attrError:
    kobject_uevent(&dev->kobj, KOBJ_REMOVE);
    kobject_del(&dev->kobj);
 Error:
    cleanup_device_parent(dev);
    put_device(parent);
name_error:
    kfree(dev->p);
    dev->p = NULL;
    goto done;
}
EXPORT_SYMBOL_GPL(device_add);

設備添加到系統的主要流程都在device_add函數中實現,上面代碼的註釋基本把主要函數的作用進行了描述。值得關注的一個函數便是bus_probe_device,該函數完成將設備綁定到驅動的動作。

void bus_probe_device(struct device *dev)
{
    struct bus_type *bus = dev->bus;
    struct subsys_interface *sif;

    if (!bus)
        return;

    if (bus->p->drivers_autoprobe) //如果bus允許自動探測
        device_initial_probe(dev); //主要功能

    mutex_lock(&bus->p->mutex);
    list_for_each_entry(sif, &bus->p->interfaces, node) //將設(shè)備綁定到接口
        if (sif->add_dev)
            sif->add_dev(dev, sif);
    mutex_unlock(&bus->p->mutex);
}

void device_initial_probe(struct device *dev)
{
    __device_attach(dev, true);
}

/*
 * __device_attach - try to bind a device to a driver.
 * @dev: device to attach; its lock is held for the duration of the call.
 * @allow_async: stored as check_async in the iteration data and, when the
 *               synchronous pass returns 0, permits scheduling an
 *               asynchronous probe.
 *
 * If dev->driver is already set, returns 1 when the device is already on a
 * driver list or when device_bind_driver() succeeds; on bind failure
 * dev->driver is cleared and 0 is returned. Otherwise returns the result of
 * offering every driver on the bus to __device_attach_driver().
 */
static int __device_attach(struct device *dev, bool allow_async)
{
    int ret = 0;

    device_lock(dev);
    if (dev->driver) {  // a driver has already been chosen for this device
        if (klist_node_attached(&dev->p->knode_driver)) { // knode_driver is already linked into a list
            ret = 1;
            goto out_unlock;
        }
        ret = device_bind_driver(dev); // bind and update the related lists
        if (ret == 0)
            ret = 1;
        else {
            dev->driver = NULL;
            ret = 0;
        }
    } else {  // no driver chosen yet for this device
        struct device_attach_data data = {
            .dev = dev,
            .check_async = allow_async,
            .want_async = false,
        };

        if (dev->parent)
            pm_runtime_get_sync(dev->parent);
        // walk every driver on the bus and try to attach
        ret = bus_for_each_drv(dev->bus, NULL, &data,
                    __device_attach_driver);
        if (!ret && allow_async && data.have_async) {
            /*
             * If we could not find appropriate driver
             * synchronously and we are allowed to do
             * async probes and there are drivers that
             * want to probe asynchronously, we'll
             * try them.
             */
            dev_dbg(dev, "scheduling asynchronous probe\n");
            get_device(dev);
            async_schedule(__device_attach_async_helper, dev);
        } else {
            pm_request_idle(dev);
        }

        if (dev->parent)
            pm_runtime_put(dev->parent);
    }
out_unlock:
    device_unlock(dev);
    return ret;
}

通過上面3個函數的追蹤可以看到,__device_attach函數遍歷bus上所有的驅動,嘗試執行attach,具體調用__device_attach_driver函數。

static int __device_attach_driver(struct device_driver *drv, void *_data)
{
    struct device_attach_data *data = _data;
    struct device *dev = data->dev;
    bool async_allowed;

    /*
     * Check if device has already been claimed. This may
     * happen with driver loading, device discovery/registration,
     * and deferred probe processing happens all at once with
     * multiple threads.
     */
    if (dev->driver) 
        return -EBUSY;

    if (!driver_match_device(drv, dev))  //調(diào)用bus的match函數(shù),測試是否匹配
        return 0;
    //進一步probe設(shè)備,需要設(shè)備已經(jīng)注冊
    async_allowed = driver_allows_async_probing(drv);

    if (async_allowed)
        data->have_async = true;
    //如果允許異步探測,則先返回
    if (data->check_async && async_allowed != data->want_async)
        return 0;

    return driver_probe_device(drv, dev);
}

int driver_probe_device(struct device_driver *drv, struct device *dev)
{
    int ret = 0;

    if (!device_is_registered(dev)) //檢查device是否register
        return -ENODEV;

    pr_debug("bus: '%s': %s: matched device %s with driver %s\n",
         drv->bus->name, __func__, dev_name(dev), drv->name);

    if (dev->parent)
        pm_runtime_get_sync(dev->parent);

    pm_runtime_barrier(dev);
    ret = really_probe(dev, drv); //真正執(zhí)行探測
    pm_request_idle(dev);

    if (dev->parent)
        pm_runtime_put(dev->parent);

    return ret;
}

從上面兩個函數來看,真正執行probe的函數是really_probe。

/*
 * really_probe - run the probe sequence for one device/driver pair.
 * @dev: device being probed.
 * @drv: driver being tried.
 *
 * Returns 1 when the device was bound to the driver. Returns 0 when any
 * step failed; in that case the work done so far is rolled back and the
 * error code is only logged, not returned.
 *
 * NOTE(review): a driver_sysfs_add() failure jumps to probe_failed while
 * ret is still 0, so the "failed with error" message prints 0 for it.
 */
static int really_probe(struct device *dev, struct device_driver *drv)
{
    int ret = 0;
    int local_trigger_count = atomic_read(&deferred_trigger_count);

    atomic_inc(&probe_count);
    pr_debug("bus: '%s': %s: probing driver %s with device %s\n",
         drv->bus->name, __func__, drv->name, dev_name(dev));
    WARN_ON(!list_empty(&dev->devres_head));

    dev->driver = drv; // point the device at the driver being tried

    /* If using pinctrl, bind pins now before probing */
    ret = pinctrl_bind_pins(dev);
    if (ret)
        goto probe_failed;

    // create the device<->driver symlinks in sysfs (one in the driver's directory, one in the device's)
    if (driver_sysfs_add(dev)) {
        printk(KERN_ERR "%s: driver_sysfs_add(%s) failed\n",
            __func__, dev_name(dev));
        goto probe_failed;
    }

    if (dev->pm_domain && dev->pm_domain->activate) {
        ret = dev->pm_domain->activate(dev);
        if (ret)
            goto probe_failed;
    }

    /*
     * Ensure devices are listed in devices_kset in correct order
     * It's important to move Dev to the end of devices_kset before
     * calling .probe, because it could be recursive and parent Dev
     * should always go first
     */
    devices_kset_move_last(dev);

    if (dev->bus->probe) {
        ret = dev->bus->probe(dev); // the bus_type probe method takes precedence
        if (ret)
            goto probe_failed;
    } else if (drv->probe) {
        ret = drv->probe(dev);  // otherwise fall back to the driver's own probe method
        if (ret)
            goto probe_failed;
    }

    pinctrl_init_done(dev);

    if (dev->pm_domain && dev->pm_domain->sync)
        dev->pm_domain->sync(dev);

    driver_bound(dev); // link the device into the driver's device list
    ret = 1;
    pr_debug("bus: '%s': %s: bound device %s to driver %s\n",
         drv->bus->name, __func__, dev_name(dev), drv->name);
    goto done;

probe_failed:  // probe failed: roll back what was set up above
    devres_release_all(dev);
    driver_sysfs_remove(dev);
    dev->driver = NULL;
    dev_set_drvdata(dev, NULL);
    if (dev->pm_domain && dev->pm_domain->dismiss)
        dev->pm_domain->dismiss(dev);

    switch (ret) {
    case -EPROBE_DEFER:
        /* Driver requested deferred probing */
        dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name);
        driver_deferred_probe_add(dev);
        /* Did a trigger occur while probing? Need to re-trigger if yes */
        if (local_trigger_count != atomic_read(&deferred_trigger_count))
            driver_deferred_probe_trigger();
        break;
    case -ENODEV:
    case -ENXIO:
        pr_debug("%s: probe of %s rejects match %d\n",
             drv->name, dev_name(dev), ret);
        break;
    default:
        /* driver matched but the probe failed */
        printk(KERN_WARNING
               "%s: probe of %s failed with error %d\n",
               drv->name, dev_name(dev), ret);
    }
    /*
     * Ignore errors returned by ->probe so that the next driver can try
     * its luck.
     */
    ret = 0;
done:
    atomic_dec(&probe_count);
    wake_up(&probe_waitqueue);
    return ret;
}

到此,設備添加到系統的主要流程便基本清楚,不再往下跟蹤。

驅動

驅動數據結構

/*
 * struct device_driver - the driver-model description of a driver.
 * Carries the driver's name, the bus type it belongs to, its match tables,
 * the lifecycle callbacks and a pointer to the driver-core private data.
 */
struct device_driver {
    const char      *name;     // driver name
    struct bus_type     *bus;  // bus type this driver belongs to

    struct module       *owner;
    const char      *mod_name;  /* used for built-in modules */

    bool suppress_bind_attrs;   /* disables bind/unbind via sysfs */
    enum probe_type probe_type;

    const struct of_device_id   *of_match_table;
    const struct acpi_device_id *acpi_match_table;

    int (*probe) (struct device *dev);  // initializes a device before it is bound to this driver
    int (*remove) (struct device *dev); // called when a device is unbound from this driver
    void (*shutdown) (struct device *dev);
    int (*suspend) (struct device *dev, pm_message_t state);
    int (*resume) (struct device *dev);
    const struct attribute_group **groups; // driver attribute groups

    const struct dev_pm_ops *pm; // power-management operations

    struct driver_private *p;  // driver-core private data
};

/*
 * struct driver_private - driver-core private part of a device_driver.
 * Holds the driver's kobject and the list linkage between the driver, its
 * bus and its devices.
 */
struct driver_private {
    struct kobject kobj;
    struct klist klist_devices;   // list of devices handled by this driver
    struct klist_node knode_bus;  // links the driver into the bus type's driver list
    struct module_kobject *mkobj;
    struct device_driver *driver;  // back-pointer to the owning device_driver
};

device_driver結構中,bus域指向驅動所屬的總線類型;driver_private結構中的knode_bus域用于鏈入總線類型的驅動鏈表,klist_devices域用于鏈接所有綁定到本驅動的設備。

驅動注冊與反注冊

驅動在加載時,需要將其注冊到總線類型,調用driver_register實現:

int driver_register(struct device_driver *drv)
{
    int ret;
    struct device_driver *other;

    BUG_ON(!drv->bus->p); //確保bus已經(jīng)注冊到驅(qū)動模型中
    //如果bus_type和driver都實現(xiàn)了同一個回調(diào),優(yōu)先使用bus_type的回調(diào)函數(shù),打印告警信息
    if ((drv->bus->probe && drv->probe) ||
        (drv->bus->remove && drv->remove) ||
        (drv->bus->shutdown && drv->shutdown))
        printk(KERN_WARNING "Driver '%s' needs updating - please use "
            "bus_type methods\n", drv->name);

    other = driver_find(drv->name, drv->bus); //根據(jù)名字查找驅(qū)動
    if (other) {
        printk(KERN_ERR "Error: Driver '%s' is already registered, "
            "aborting...\n", drv->name);
        return -EBUSY;
    }

    ret = bus_add_driver(drv); //將driver添加到bus
    if (ret)
        return ret;
    ret = driver_add_groups(drv, drv->groups); //創(chuàng)建driver屬性文件
    if (ret) {
        bus_remove_driver(drv);
        return ret;
    }
    kobject_uevent(&drv->p->kobj, KOBJ_ADD); //發(fā)送ADD事件到用戶空間

    return ret;
}
EXPORT_SYMBOL_GPL(driver_register);

添加driver到bus_type,由bus_add_driver實現:

int bus_add_driver(struct device_driver *drv)
{
    struct bus_type *bus;
    struct driver_private *priv;
    int error = 0;

    bus = bus_get(drv->bus);
    if (!bus)
        return -EINVAL;

    pr_debug("bus: '%s': add driver %s\n", bus->name, drv->name);

    priv = kzalloc(sizeof(*priv), GFP_KERNEL);  //分配driver_private結(jié)構(gòu)空間
    if (!priv) {
        error = -ENOMEM;
        goto out_put_bus;
    }
    klist_init(&priv->klist_devices, NULL, NULL); //初始化driver設(shè)備鏈表
    priv->driver = drv; //關(guān)聯(lián)device_driver和driver_private
    drv->p = priv;
    priv->kobj.kset = bus->p->drivers_kset; //driver_private中的kobj的kset域指向subsys中的drivers_kset
    error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL,  //添加driver到sysfs
                     "%s", drv->name);
    if (error)
        goto out_unregister;

    klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers); //添加driver到bus的驅(qū)動鏈表中
    if (drv->bus->p->drivers_autoprobe) {  //自動探測
        if (driver_allows_async_probing(drv)) {  //允許異步執(zhí)行probe
            pr_debug("bus: '%s': probing driver %s asynchronously\n",
                drv->bus->name, drv->name);
            async_schedule(driver_attach_async, drv); //異步probe
        } else {
            error = driver_attach(drv);  //同步probe
            if (error)
                goto out_unregister;
        }
    }
    module_add_driver(drv->owner, drv);  //驅(qū)動實現(xiàn)的模塊

    error = driver_create_file(drv, &driver_attr_uevent);  //在driver中添加uevent屬性文件
    if (error) {
        printk(KERN_ERR "%s: uevent attr (%s) failed\n",
            __func__, drv->name);
    }
    error = driver_add_groups(drv, bus->drv_groups);  //添加driver的屬性文件
    if (error) {
        /* How the hell do we get out of this pickle? Give up */
        printk(KERN_ERR "%s: driver_create_groups(%s) failed\n",
            __func__, drv->name);
    }

    if (!drv->suppress_bind_attrs) {
        error = add_bind_files(drv);  //在driver目錄添加的bind和unbind兩個屬性文件
        if (error) {
            /* Ditto */
            printk(KERN_ERR "%s: add_bind_files(%s) failed\n",
                __func__, drv->name);
        }
    }

    return 0;

out_unregister:
    kobject_put(&priv->kobj);
    kfree(drv->p);
    drv->p = NULL;
out_put_bus:
    bus_put(bus);
    return error;
}

bus_add_driver函數完成驅動添加到總線類型,當驅動添加完成后,如果總線類型設置了允許自動探測標志drivers_autoprobe,便可以根據是否允許異步探測調用driver_attach_async或driver_attach,driver_attach_async也是調用driver_attach:

/*
 * driver_attach - try to bind a driver to the devices on its bus.
 * @drv: driver to attach.
 *
 * Runs __driver_attach() for each device on drv->bus and returns the
 * iteration result.
 */
int driver_attach(struct device_driver *drv)
{
    struct bus_type *bus = drv->bus;

    return bus_for_each_dev(bus, NULL, drv, __driver_attach);
}
EXPORT_SYMBOL_GPL(driver_attach);

static int __driver_attach(struct device *dev, void *data)
{
    struct device_driver *drv = data;

    /*
     * Lock device and try to bind to it. We drop the error
     * here and always return 0, because we need to keep trying
     * to bind to devices and some drivers will return an error
     * simply if it didn't support the device.
     *
     * driver_probe_device() will spit a warning if there
     * is an error.
     */

    if (!driver_match_device(drv, dev)) //調(diào)用bus_type.match
        return 0;

    if (dev->parent)    /* Needed for USB */
        device_lock(dev->parent);
    device_lock(dev);
    if (!dev->driver)
        driver_probe_device(drv, dev); //完成probe的主要函數(shù)
    device_unlock(dev);
    if (dev->parent)
        device_unlock(dev->parent);

    return 0;
}

int driver_probe_device(struct device_driver *drv, struct device *dev)
{
    int ret = 0;

    if (!device_is_registered(dev)) //檢查device是否register
        return -ENODEV;

    pr_debug("bus: '%s': %s: matched device %s with driver %s\n",
         drv->bus->name, __func__, dev_name(dev), drv->name);

    if (dev->parent)
        pm_runtime_get_sync(dev->parent);

    pm_runtime_barrier(dev);
    ret = really_probe(dev, drv); //真正執(zhí)行探測
    pm_request_idle(dev);

    if (dev->parent)
        pm_runtime_put(dev->parent);

    return ret;
}

根據上面3個函數,最終仍然是調用前面描述過的really_probe函數完成最后的探測。

到這里驅動注冊完成,結合之前的設備注冊流程,無論是驅動注冊還是設備注冊,只要總線類型設置了自動探測標志位,這兩個流程都會執行探測。所以設備發現與驅動加載的先后順序已經不再重要,也正是通過這種雙向探測方式,Linux內核支持設備的熱插拔機制。

驅動卸載時,需要調用driver_unregister函數,使driver脫離總線類型:

/*
 * driver_unregister - remove a driver from its bus.
 * @drv: driver to unregister.
 *
 * Emits a warning and does nothing when @drv is NULL or was never given
 * private data; otherwise removes the driver's attribute groups and takes
 * the driver off the bus.
 */
void driver_unregister(struct device_driver *drv)
{
    if (drv && drv->p) {
        driver_remove_groups(drv, drv->groups); /* remove the driver's attribute files */
        bus_remove_driver(drv);                 /* take the driver off its bus type */
        return;
    }

    WARN(1, "Unexpected driver unregister!\n");
}
EXPORT_SYMBOL_GPL(driver_unregister);

/*
 * bus_remove_driver - take a driver off its bus.
 * @drv: driver to remove.
 *
 * Returns immediately when the driver has no bus. Otherwise removes the
 * driver's sysfs attribute files, unlinks it from the bus driver list,
 * detaches it from its devices and puts the driver kobject and the bus.
 */
void bus_remove_driver(struct device_driver *drv)
{
    if (!drv->bus)
        return;

    if (!drv->suppress_bind_attrs)
        remove_bind_files(drv);   // remove the bind and unbind files from the driver directory
    driver_remove_groups(drv, drv->bus->drv_groups); // remove the bus-defined driver attribute files
    driver_remove_file(drv, &driver_attr_uevent);    // remove the uevent file from the driver directory
    klist_remove(&drv->p->knode_bus); // unlink the driver from the bus type's driver list
    pr_debug("bus: '%s': remove driver %s\n", drv->bus->name, drv->name);
    driver_detach(drv);  // unbind the driver from every device bound to it
    module_remove_driver(drv);  // NOTE(review): presumably the counterpart of module_add_driver() -- confirm
    kobject_put(&drv->p->kobj); // drop a reference on the driver kobject
    bus_put(drv->bus);
}

類數據結構

/*
 * struct class - the driver-model description of a device class.
 * Holds the class name and owner, the attributes shared by the class and
 * its devices, the class-level callbacks and the subsystem private data.
 */
struct class {
    const char      *name;       // class name
    struct module       *owner;  // module that implements this class

    struct class_attribute      *class_attrs;     // attributes of the class itself
    const struct attribute_group    **dev_groups; // default attributes of devices belonging to the class
    struct kobject          *dev_kobj;            // kobject used to link the class into sysfs

    int (*dev_uevent)(struct device *dev, struct kobj_uevent_env *env); // sets class-specific environment variables before an event is sent
    char *(*devnode)(struct device *dev, umode_t *mode); // returns the device name when a device node is created

    void (*class_release)(struct class *class); // called when the class is released
    void (*dev_release)(struct device *dev);    // called when a device is released

    int (*suspend)(struct device *dev, pm_message_t state); // called when a device goes to sleep
    int (*resume)(struct device *dev);                      // called when a device is woken up

    const struct kobj_ns_type_operations *ns_type;  // namespace support for sysfs
    const void *(*namespace)(struct device *dev);   // returns the namespace the device lives in

    const struct dev_pm_ops *pm;  // power-management operations

    struct subsys_private *p;     // subsystem private data of the class
};

類的私有數據類型與總線類型的私有數據類型都是subsys_private,這里將不再重復描述。

類注冊與反注冊

子系統需要使用類時,需要調用class_register函數向系統注冊類:

/*
 * class_register - register @class through __class_register().
 *
 * Written as a macro so that every expansion declares its own static
 * lock_class_key, which is passed along as the key argument.
 */
#define class_register(class)                   \
({                                              \
    static struct lock_class_key __key;         \
    __class_register(class, &__key);            \
})

int __class_register(struct class *cls, struct lock_class_key *key)
{
    struct subsys_private *cp;
    int error;

    pr_debug("device class '%s': registering\n", cls->name);

    cp = kzalloc(sizeof(*cp), GFP_KERNEL); //分配私有數(shù)據(jù)空間
    if (!cp)
        return -ENOMEM;
    klist_init(&cp->klist_devices, klist_class_dev_get, klist_class_dev_put); //初始化該class的device鏈表
    INIT_LIST_HEAD(&cp->interfaces);  //初始化接口鏈表
    kset_init(&cp->glue_dirs);
    __mutex_init(&cp->mutex, "subsys mutex", key);
    error = kobject_set_name(&cp->subsys.kobj, "%s", cls->name); //將在/sys/class/目錄下顯示該名稱
    if (error) {
        kfree(cp);
        return error;
    }

    /* set the default /sys/dev directory for devices of this class */
    if (!cls->dev_kobj)
        cls->dev_kobj = sysfs_dev_char_kobj;

#if defined(CONFIG_BLOCK)
    /* let the block class directory show up in the root of sysfs */
    if (!sysfs_deprecated || cls != &block_class)
        cp->subsys.kobj.kset = class_kset;
#else
    cp->subsys.kobj.kset = class_kset;  // 全局變量class_kset指的是 /sys/class/
#endif
    cp->subsys.kobj.ktype = &class_ktype;
    cp->class = cls;  //class與subsys_private關(guān)聯(lián)
    cls->p = cp;

    error = kset_register(&cp->subsys);  //在/sys/class/目錄下創(chuàng)建該類對應的目錄
    if (error) {
        kfree(cp);
        return error;
    }
    error = add_class_attrs(class_get(cls)); //在/sys/class/xxx/目錄下創(chuàng)建類屬性文件
    class_put(cls);
    return error;
}
EXPORT_SYMBOL_GPL(__class_register);

類的注冊比較簡單,注釋已經比較詳細。當子系統需要卸載類時,需要調用class_unregister函數:

/*
 * class_unregister - unregister a device class.
 * @cls: class to unregister.
 *
 * Removes the class attribute files and then unregisters the kset held in
 * the class's private data.
 */
void class_unregister(struct class *cls)
{
    pr_debug("device class '%s': unregistering\n", cls->name);

    /* First the attribute files under /sys/class/xxx/ ... */
    remove_class_attrs(cls);

    /* ... then the class directory itself. */
    kset_unregister(&cls->p->subsys);
}

接口

接口數據結構

/*
 * struct class_interface - an interface attached to a device class.
 * add_dev/remove_dev are invoked for the devices of the class when the
 * interface is registered or unregistered, and for a device when it is
 * added to the class.
 */
struct class_interface {
    struct list_head    node;    // links the interface into its class
    struct class        *class;  // class this interface belongs to
    // called to add a device to, or remove it from, the interface when the
    // interface is registered or a device joins the interface's class
    int (*add_dev)      (struct device *, struct class_interface *);
    void (*remove_dev)  (struct device *, struct class_interface *);
};

接口注冊與反注冊

向類中注冊接口,需要調用class_interface_register函數完成:

/*
 * class_interface_register - attach an interface to its class.
 * @class_intf: interface to register; class_intf->class names the class.
 *
 * Returns -ENODEV when the interface or its class pointer is NULL,
 * -EINVAL when class_get() yields no class, and 0 otherwise. With the
 * class mutex held, the interface is appended to the class's interface
 * list and, if it has an add_dev() callback, that callback is run for
 * every device returned by the class device iterator.
 */
int class_interface_register(struct class_interface *class_intf)
{
    struct class_dev_iter iter;
    struct class *owner;
    struct device *dev;

    if (class_intf == NULL || class_intf->class == NULL)
        return -ENODEV;

    /* Take a reference on the owning class. */
    owner = class_get(class_intf->class);
    if (owner == NULL)
        return -EINVAL;

    mutex_lock(&owner->p->mutex);
    list_add_tail(&class_intf->node, &owner->p->interfaces);
    if (class_intf->add_dev != NULL) {
        class_dev_iter_init(&iter, owner, NULL, NULL);
        for (dev = class_dev_iter_next(&iter); dev != NULL;
             dev = class_dev_iter_next(&iter))
            class_intf->add_dev(dev, class_intf);
        class_dev_iter_exit(&iter);
    }
    mutex_unlock(&owner->p->mutex);

    return 0;
}

從類中刪除接口,需要調用class_interface_unregister函數完成:

/*
 * class_interface_unregister - detach an interface from its class.
 * @class_intf: interface to unregister.
 *
 * Does nothing when the interface has no class. Otherwise, with the class
 * mutex held, unlinks the interface from the class's interface list and,
 * if it has a remove_dev() callback, runs it for every device returned by
 * the class device iterator; finally puts the class.
 */
void class_interface_unregister(struct class_interface *class_intf)
{
    struct class_dev_iter iter;
    struct device *dev;
    struct class *owner = class_intf->class;

    if (owner == NULL)
        return;

    mutex_lock(&owner->p->mutex);
    list_del_init(&class_intf->node);
    if (class_intf->remove_dev != NULL) {
        class_dev_iter_init(&iter, owner, NULL, NULL);
        for (dev = class_dev_iter_next(&iter); dev != NULL;
             dev = class_dev_iter_next(&iter))
            class_intf->remove_dev(dev, class_intf);
        class_dev_iter_exit(&iter);
    }
    mutex_unlock(&owner->p->mutex);

    class_put(owner);
}

基于設備驅動模型實現子系統

Linux設備驅動模型已經實現了各種對象之間的關系以及它們在sysfs中的呈現方式。實現子系統只需要定義業務自身的總線類型、設備、驅動、類、接口,分別“繼承”bus_type、device、device_driver、class、class_interface,并根據具體業務實現各個結構規定的回調函數,最后調用上述的注冊函數添加到系統,便完成子系統的開發。


SCSI子系統之概述

Linux SCSI子系統的分層架構:

這里寫圖片描述

  • 低層:代表與SCSI的物理接口的實際驅動器,例如各個廠商為其特定的主機適配器(Host Bus Adapter, HBA)開發的驅動,低層驅動主要作用是發現連接到主機適配器的scsi設備,在內存中構建scsi子系統所需的數據結構,并提供消息傳遞接口,將scsi命令的接收與發送解釋為主機適配器的操作。

  • 高層:代表各種scsi設備類型的驅動,如scsi磁盤驅動,scsi磁帶驅動,高層驅動認領低層驅動發現的scsi設備,為這些設備分配名稱,將對設備的IO轉換為scsi命令,交由低層驅動處理。

  • 中層:包含scsi棧的公共服務函數。高層和低層通過調用中層的函數完成其功能,而中層在執行過程中,也需要調用高層和低層注冊的回調函數做一些個性化處理。

Linux SCSI模型

這里寫圖片描述

Linux SCSI模型是內核的抽象,主機適配器連接主機IO總線(如PCI總線)和存儲IO總線(如SCSI總線)。一臺計算機可以有多個主機適配器,而主機適配器可以控制一條或多條SCSI總線,一條總線可以有多個目標節點與之相連,并且一個目標節點可以有多個邏輯單元。

在Linux SCSI子系統中,內核中的目標節點(target)對應SCSI磁盤,SCSI磁盤中可以有多個邏輯單元,統一由磁盤控制器控制,這些邏輯單元才是真正作為IO終點的存儲設備,內核用設備(device)對邏輯單元進行抽象;內核中的Host對應主機適配器(物理的HBA/RAID卡,虛擬的iscsi target)。

內核使用四元組 <host:channel:id:lun> 來唯一標識一個scsi的邏輯單元,在sysfs中查看sda磁盤<2:0:0:0>顯示如下:

root@ubuntu16:/home/comet/Costor/bin# ls /sys/bus/scsi/devices/2\:0\:0\:0/block/sda/
alignment_offset  device             events_poll_msecs  integrity  removable  sda5    subsystem
bdi               discard_alignment  ext_range          power      ro         size    trace
capability        events             holders            queue      sda1       slaves  uevent
dev               events_async       inflight           range      sda2       stat
root@ubuntu16:/home/comet/Costor/bin# cat /sys/bus/scsi/devices/2\:0\:0\:0/block/sda/dev
8:0
root@ubuntu16:/home/comet/Costor/bin# ll /dev/sda
brw-rw---- 1 root disk 8, 0 Sep 19 11:36 /dev/sda
  • host: 主機適配器的唯一編號。

  • channel: 主機適配器中scsi通道編號,由主機適配器固件維護。

  • id: 目標節點唯一標識符。

  • lun: 目標節點內邏輯單元編號。

SCSI命令

SCSI 命令是在 Command Descriptor Block (CDB) 中定義的。CDB 包含了用來定義要執行的特定操作的操作代碼,以及大量特定于操作的參數。

| 命令 | 用途 |
| --- | --- |
| Test unit ready | 查詢設備是否已經準備好進行傳輸 |
| Inquiry | 請求設備基本信息 |
| Request sense | 請求之前命令的錯誤信息 |
| Read capacity | 請求存儲容量信息 |
| Read | 從設備讀取數據 |
| Write | 向設備寫入數據 |
| Mode sense | 請求模式頁面(設備參數) |
| Mode select | 在模式頁面配置設備參數 |

借助大約 60 種可用命令,SCSI 可適用于許多設備(包括隨機存取設備,比如磁盤,以及像磁帶這樣的順序存儲設備)。SCSI 也提供了專門的命令以訪問箱體服務(比如存儲箱體內部當前的傳感和溫度)。

核心數據結構

主機適配器模板scsi_host_template

主機適配器模板是相同型號主機適配器的公共內容,包括請求隊列深度,SCSI命令處理回調函數,錯誤處理恢復函數。分配主機適配器結構時,需要使用主機適配器模板來賦值。在編寫SCSI低層驅動時,第一步便是定義模板scsi_host_template,之后才能由模板生成主機適配器。

struct scsi_host_template {
    struct module *module;  //指向使用該模板實現(xiàn)的scsi_host,低層驅(qū)動模塊。
    const char *name;       //主機適配器名稱

    int (* detect)(struct scsi_host_template *);
    int (* release)(struct Scsi_Host *);

    const char *(* info)(struct Scsi_Host *); //返回HBA相關(guān)信息,可選實現(xiàn)

    int (* ioctl)(struct scsi_device *dev, int cmd, void __user *arg); //用戶空間ioctl函數(shù)的實現(xiàn),可選實現(xiàn)


#ifdef CONFIG_COMPAT
    //通過該函數(shù),支持32位系統(tǒng)的用戶態(tài)ioctl函數(shù)
    int (* compat_ioctl)(struct scsi_device *dev, int cmd, void __user *arg);
#endif

    //將scsi命令放進低層驅(qū)動的隊列,由中間層調(diào)用,必須實現(xiàn)
    int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *);

    //以下5個函數(shù)是錯誤處理回調(diào)函數(shù),由中間層按照嚴重程度調(diào)用
    int (* eh_abort_handler)(struct scsi_cmnd *);        //Abort
    int (* eh_device_reset_handler)(struct scsi_cmnd *); //Device Reset
    int (* eh_target_reset_handler)(struct scsi_cmnd *); //Target Reset
    int (* eh_bus_reset_handler)(struct scsi_cmnd *);    //Bus Reset
    int (* eh_host_reset_handler)(struct scsi_cmnd *);   //Host Reset

    //當掃描到新磁盤時調(diào)用,中間層回調(diào)這個函數(shù)中可以分配和初始化低層驅(qū)動所需要的結(jié)構(gòu)
    int (* slave_alloc)(struct scsi_device *)

//在設(shè)備受到INQUIRY命令后,執(zhí)行相關(guān)的配置操作
    int (* slave_configure)(struct scsi_device *);

    //在scsi設(shè)備銷毀之前調(diào)用,中間層回調(diào)用于釋放slave_alloc分配的私有數(shù)據(jù)
    void (* slave_destroy)(struct scsi_device *);

    //當發(fā)現(xiàn)新的target,中間層調(diào)用,用戶分配target私有數(shù)據(jù)
    int (* target_alloc)(struct scsi_target *);

    //在target被銷毀之前,中間層調(diào)用,低層驅(qū)動實現(xiàn),用于釋放target_alloc分配的數(shù)據(jù)
    void (* target_destroy)(struct scsi_target *);

    //需要自定義掃描target邏輯時,中間層循環(huán)檢查返回值,直到該函數(shù)返回1,表示掃描完成
    int (* scan_finished)(struct Scsi_Host *, unsigned long);

    //需要自定義掃描target邏輯時,掃描開始前回調(diào)
    void (* scan_start)(struct Scsi_Host *);

    //改變主機適配器的隊列深度,返回設(shè)置的隊列深度
    int (* change_queue_depth)(struct scsi_device *, int);

    //返回磁盤的BIOS參數(shù),如size, device, list (heads, sectors, cylinders)
    int (* bios_param)(struct scsi_device *, struct block_device *,
            sector_t, int []);

    void (*unlock_native_capacity)(struct scsi_device *);

    //在procfs中的讀寫操作回調(diào)
    int (*show_info)(struct seq_file *, struct Scsi_Host *);
    int (*write_info)(struct Scsi_Host *, char *, int);

    //中間層發(fā)現(xiàn)scsi命令超時回調(diào)
    enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *);

    //通過sysfs屬性reset主機適配器時,回調(diào)
    int (*host_reset)(struct Scsi_Host *shost, int reset_type);
#define SCSI_ADAPTER_RESET  1
#define SCSI_FIRMWARE_RESET 2

    const char *proc_name; //在proc文件系統(tǒng)的名稱

    struct proc_dir_entry *proc_dir;

    int can_queue; //主機適配器能同時接受的命令數(shù)

    int this_id;

    /*
     * This determines the degree to which the host adapter is capable
     * of scatter-gather.
     */  //聚散列表的參數(shù)
    unsigned short sg_tablesize;
    unsigned short sg_prot_tablesize;

    /*
     * Set this if the host adapter has limitations beside segment count.
     */ //單個scsi命令能夠訪問的扇區(qū)最大數(shù)量
    unsigned int max_sectors;

    /*
     * DMA scatter gather segment boundary limit. A segment crossing this
     * boundary will be split in two.
     */
    unsigned long dma_boundary; //DMA聚散段邊界值,超過該值將被切割成兩個

#define SCSI_DEFAULT_MAX_SECTORS    1024

    short cmd_per_lun;

    /*
     * present contains counter indicating how many boards of this
     * type were found when we did the scan.
     */
    unsigned char present;

    /* If use block layer to manage tags, this is tag allocation policy */
    int tag_alloc_policy;

    /*
     * Track QUEUE_FULL events and reduce queue depth on demand.
     */
    unsigned track_queue_depth:1;

    /*
     * This specifies the mode that a LLD supports.
     */
    unsigned supported_mode:2; //低層驅(qū)動支持的模式(initiator或target)

    /*
     * True if this host adapter uses unchecked DMA onto an ISA bus.
     */
    unsigned unchecked_isa_dma:1;

    unsigned use_clustering:1;

    /*
     * True for emulated SCSI host adapters (e.g. ATAPI).
     */
    unsigned emulated:1;

    /*
     * True if the low-level driver performs its own reset-settle delays.
     */
    unsigned skip_settle_delay:1;

    /* True if the controller does not support WRITE SAME */
    unsigned no_write_same:1;

    /*
     * True if asynchronous aborts are not supported
     */
    unsigned no_async_abort:1;

    /*
     * Countdown for host blocking with no commands outstanding.
     */
    unsigned int max_host_blocked; //主機適配器發(fā)送隊列的低閥值,允許累計多個命令同時派發(fā)

#define SCSI_DEFAULT_HOST_BLOCKED   7

    /*
     * Pointer to the sysfs class properties for this host, NULL terminated.
     */
    struct device_attribute **shost_attrs; //主機適配器類屬性

    /*
     * Pointer to the SCSI device properties for this host, NULL terminated.
     */
    struct device_attribute **sdev_attrs;  //主機適配器設(shè)備屬性

    struct list_head legacy_hosts;

    u64 vendor_id;

    /*
     * Additional per-command data allocated for the driver.
     */  //scsi 命令緩沖池,scsi命令都是預先分配好的,保存在cmd_pool中
    unsigned int cmd_size;
    struct scsi_host_cmd_pool *cmd_pool;

    /* temporary flag to disable blk-mq I/O path */
    bool disable_blk_mq;  //禁用通用塊層多隊列模式標志
};

主機適配器Scsi_Host

Scsi_Host描述一個SCSI主機適配器,SCSI主機適配器通常是一塊基于PCI總線的擴展卡或是一個SCSI控制器芯片。每個SCSI主機適配器可以存在多個通道,一個通道實際擴展了一條SCSI總線。每個通道可以連接多個SCSI目標節點,具體連接數量與SCSI總線帶載能力有關,或者受具體SCSI協議的限制。真實的主機總線適配器是接入主機IO總線上(通常是PCI總線),在系統啟動時,會掃描掛載在PCI總線上的設備,此時會分配主機總線適配器。
Scsi_Host結構包含內嵌通用設備,將被鏈入SCSI總線類型(scsi_bus_type)的設備鏈表。

/*
 * struct Scsi_Host - one SCSI host adapter.
 * Holds the adapter's device and target lists, command pool and
 * error-handling state, the limits copied from its scsi_host_template, the
 * embedded driver-model devices and the driver-private hostdata tail.
 */
struct Scsi_Host {
    struct list_head    __devices; /* device list */
    struct list_head    __targets; /* target list */

    struct scsi_host_cmd_pool *cmd_pool; /* scsi command pool */
    spinlock_t      free_list_lock;   /* protects free_list */
    struct list_head    free_list; /* backup store of cmd structs (preallocated spare commands) */
    struct list_head    starved_list; /* starved list for scsi commands */

    spinlock_t      default_lock;
    spinlock_t      *host_lock;

    struct mutex        scan_mutex;/* serialize scanning activity */

    struct list_head    eh_cmd_q; /* list of scsi commands that failed */
    struct task_struct    * ehandler;  /* Error recovery thread. */
    struct completion     * eh_action; /* Wait for specific actions on the
                          host. */
    wait_queue_head_t       host_wait; /* wait queue for scsi device recovery */
    struct scsi_host_template *hostt;  /* host adapter template */
    struct scsi_transport_template *transportt; /* SCSI transport-layer template */

    /*
     * Area to keep a shared tag map (if needed, will be
     * NULL if not).
     */
    union {
        struct blk_queue_tag    *bqt;
        struct blk_mq_tag_set   tag_set; /* used when SCSI runs in multi-queue mode */
    };
    /* number of scsi commands already dispatched to the host adapter (low-level driver) */
    atomic_t host_busy;        /* commands actually active on low-level */
    atomic_t host_blocked;  /* blocked counter; counts down from max_host_blocked (see that field) */

    unsigned int host_failed;      /* commands that failed.
                          protected by host_lock */
    unsigned int host_eh_scheduled;    /* EH scheduled without command */

    unsigned int host_no;  /* Used for IOCTL_GET_IDLUN, /proc/scsi et al.; unique within the system */

    /* next two fields are used to bound the time spent in error handling */
    int eh_deadline;
    unsigned long last_reset; /* time of the last reset */


    /*
     * These three parameters can be used to allow for wide scsi,
     * and for host adapters that support multiple busses
     * The last two should be set to 1 more than the actual max id
     * or lun (e.g. 8 for SCSI parallel systems).
     */
    unsigned int max_channel; /* highest channel number of the host adapter */
    unsigned int max_id;      /* highest target id of the host adapter */
    u64 max_lun;              /* highest lun of the host adapter */

    unsigned int unique_id;

    /*
     * The maximum length of SCSI commands that this host can accept.
     * Probably 12 for most host adapters, but could be 16 for others.
     * or 260 if the driver supports variable length cdbs.
     * For drivers that don't set this field, a value of 12 is
     * assumed.
     */
    unsigned short max_cmd_len;  /* longest SCSI command the host adapter accepts */
    /* the following fields also exist in scsi_host_template and are set from it */
    int this_id;
    int can_queue;
    short cmd_per_lun;
    short unsigned int sg_tablesize;
    short unsigned int sg_prot_tablesize;
    unsigned int max_sectors;
    unsigned long dma_boundary;
    /*
     * In scsi-mq mode, the number of hardware queues supported by the LLD.
     *
     * Note: it is assumed that each hardware queue has a queue depth of
     * can_queue. In other words, the total queue depth per host
     * is nr_hw_queues * can_queue.
     */
    unsigned nr_hw_queues; /* number of hardware queues the low-level driver supports in scsi-mq mode */
    /*
     * Used to assign serial numbers to the cmds.
     * Protected by the host lock.
     */
    unsigned long cmd_serial_number;

    /*
     * This member had been fused into the comment of the previous line in
     * the text; restored as its own declaration.
     */
    unsigned active_mode:2;           /* whether the host acts as initiator or target */
    unsigned unchecked_isa_dma:1;
    unsigned use_clustering:1;

    /*
     * Host has requested that no further requests come through for the
     * time being.
     */
    unsigned host_self_blocked:1; /* set when the low-level driver asks to block this host; the mid layer then stops dispatching commands to its queue */

    /*
     * Host uses correct SCSI ordering not PC ordering. The bit is
     * set for the minority of drivers whose authors actually read
     * the spec ;).
     */
    unsigned reverse_ordering:1;

    /* Task mgmt function in progress */
    unsigned tmf_in_progress:1;

    /* Asynchronous scan in progress */
    unsigned async_scan:1;

    /* Don't resume host in EH */
    unsigned eh_noresume:1;

    /* The controller does not support WRITE SAME */
    unsigned no_write_same:1;

    unsigned use_blk_mq:1;       /* whether the SCSI multi-queue mode is in use */
    unsigned use_cmd_list:1;

    /* Host responded with short (<36 bytes) INQUIRY result */
    unsigned short_inquiry:1;

    /*
     * Optional work queue to be utilized by the transport
     */
    char work_q_name[20];  /* name of the work queue used by the scsi transport layer */
    struct workqueue_struct *work_q;

    /*
     * Task management function work queue
     */
    struct workqueue_struct *tmf_work_q;

    /* The transport requires the LUN bits NOT to be stored in CDB[1] */
    unsigned no_scsi2_lun_in_cdb:1;

    /*
     * Value host_blocked counts down from
     */
    unsigned int max_host_blocked;

    /* Protection Information */
    unsigned int prot_capabilities;
    unsigned char prot_guard_type;

    /*
     * q used for scsi_tgt msgs, async events or any other requests that
     * need to be processed in userspace
     */
    struct request_queue *uspace_req_q;

    /* legacy crap */
    unsigned long base;
    unsigned long io_port;   /* I/O port number */
    unsigned char n_io_port;
    unsigned char dma_channel;
    unsigned int  irq;


    enum scsi_host_state shost_state; /* host state */

    /*
     * ldm bits
     * shost_gendev: embedded generic device, linked into the device list of
     *               the SCSI bus type (scsi_bus_type).
     * shost_dev:    embedded class device, linked into the device list of
     *               the SCSI host class (shost_class).
     */
    struct device       shost_gendev, shost_dev;
    /*
     * List of hosts per template.
     *
     * This is only for use by scsi_module.c for legacy templates.
     * For these access to it is synchronized implicitly by
     * module_init/module_exit.
     */
    struct list_head sht_legacy_list;

    /*
     * Points to the transport data (if any) which is allocated
     * separately
     */
    void *shost_data; /* used by the SCSI transport layer */

    /*
     * Points to the physical bus device we'd use to do DMA
     * Needed just in case we have virtual hosts.
     */
    struct device *dma_dev;

    /*
     * We should ensure that this is aligned, both for better performance
     * and also because some compilers (m68k) don't automatically force
     * alignment to a long boundary.
     */ /* host-adapter private data */
    unsigned long hostdata[0]  /* Used for storage of host specific stuff */
        __attribute__ ((aligned (sizeof(unsigned long))));
};

目標節點scsi_target

scsi_target結構中有一個內嵌驅動模型設備,被鏈入SCSI總線類型scsi_bus_type的設備鏈表。

/*
 * struct scsi_target - descriptor of one target node behind a host adapter.
 *
 * Embeds a driver-model device (dev) and keeps the list of logical devices
 * that belong to the target. starget_data[] is trailing storage for the
 * transport and must remain the last member.
 */
struct scsi_target {
    struct scsi_device  *starget_sdev_user; /* device currently doing I/O; NULL when there is no I/O */
    struct list_head    siblings;  /* links into the host adapter's target list */
    struct list_head    devices;   /* list of devices belonging to this target */
    struct device       dev;       /* generic device, used to join the driver model */
    struct kref     reap_ref; /* last put renders target invisible; reference count of this structure */
    unsigned int        channel;   /* channel number this target is on */
    unsigned int        id; /* target id ... replace
                     * scsi_device.id eventually */
    unsigned int        create:1; /* signal that it needs to be added */
    unsigned int        single_lun:1;   /* Indicates we should only
                         * allow I/O to one of the luns
                         * for the device at a time. */
    unsigned int        pdt_1f_for_no_lun:1;    /* PDT = 0x1f
                         * means no lun present. */
    unsigned int        no_report_luns:1;   /* Don't use
                         * REPORT LUNS for scanning. */
    unsigned int        expecting_lun_change:1; /* A device has reported
                         * a 3F/0E UA, other devices on
                         * the same target will also. */
    /* commands actually active on LLD. */
    atomic_t        target_busy;
    atomic_t        target_blocked;           /* per the original note: number of currently blocked commands */

    /*
     * LLDs should set this in the slave_alloc host template callout.
     * If set to zero then there is not limit.
     */
    unsigned int        can_queue;             /* number of commands handled at the same time */
    unsigned int        max_target_blocked;    /* threshold for the blocked counter */
#define SCSI_DEFAULT_TARGET_BLOCKED 3

    char            scsi_level;                /* supported SCSI specification level */
    enum scsi_target_state  state;             /* target state */
    void            *hostdata; /* available to low-level driver */
    unsigned long       starget_data[0]; /* for the transport (used by the SCSI transport layer) */
    /* starget_data must be the last element!!!! */
} __attribute__((aligned(sizeof(unsigned long))));

邏輯設備scsi_device

scsi_device描述scsi邏輯設備,代表scsi磁盤的邏輯單元lun。scsi_device描述符所代表的設備可能是另一臺存儲設備上的SATA/SAS/SCSI磁盤或SSD。操作系統在掃描到連接在主機適配器上的邏輯設備時,創建scsi_device結構,用于scsi高層驅動和該設備通信。

/*
 * struct scsi_device - descriptor of one SCSI logical device (a LUN).
 *
 * Ties the logical unit to its host adapter, target and request queue,
 * caches INQUIRY/VPD data and per-device quirk flags, and embeds the two
 * driver-model devices (sdev_gendev, sdev_dev). sdev_data[] is trailing
 * storage used by the transport layer.
 */
struct scsi_device {
    struct Scsi_Host *host;  /* host bus adapter this device belongs to */
    struct request_queue *request_queue; /* request queue */

    /* the next two are protected by the host->host_lock */
    struct list_head    siblings;   /* list of all devices on this host; links into the host adapter's device list */
    struct list_head    same_target_siblings; /* just the devices sharing same target id; links into the target's device list */

    atomic_t device_busy;       /* commands actually active on LLDD */
    atomic_t device_blocked;    /* Device returned QUEUE_FULL. */

    spinlock_t list_lock;
    struct list_head cmd_list;  /* queue of in use SCSI Command structures */
    struct list_head starved_entry; /* links into the host adapter's "starved" list */
    struct scsi_cmnd *current_cmnd; /* currently active command */
    unsigned short queue_depth; /* How deep of a queue we want */
    unsigned short max_queue_depth; /* max queue depth */
    unsigned short last_queue_full_depth; /* These two are used by */
    unsigned short last_queue_full_count; /* scsi_track_queue_full() */
    unsigned long last_queue_full_time; /* last queue full time */
    unsigned long queue_ramp_up_period; /* ramp up period in jiffies */
#define SCSI_DEFAULT_RAMP_UP_PERIOD (120 * HZ)

    unsigned long last_queue_ramp_up;   /* last queue ramp up time */

    unsigned int id, channel; /* target id and channel number this device belongs to */
    u64 lun;  /* LUN number of this device */
    unsigned int manufacturer;  /* Manufacturer of device, for using
                     * vendor-specific cmd's */
    unsigned sector_size;   /* size in bytes (hardware sector size) */

    void *hostdata;     /* available to low-level driver (private data) */
    char type;          /* SCSI device type */
    char scsi_level;    /* supported SCSI specification version, obtained via the INQUIRY command */
    char inq_periph_qual;   /* PQ from INQUIRY data */
    unsigned char inquiry_len;  /* valid bytes in 'inquiry' */
    unsigned char * inquiry;    /* INQUIRY response data */
    const char * vendor;        /* [back_compat] point into 'inquiry' ... */
    const char * model;     /* ... after scan; point to static string */
    const char * rev;       /* ... "nullnullnullnull" before scan */

#define SCSI_VPD_PG_LEN                255
    int vpd_pg83_len;          /* length of the data in vpd_pg83 (VPD page 0x83) */
    unsigned char *vpd_pg83;
    int vpd_pg80_len;          /* length of the data in vpd_pg80 (VPD page 0x80) */
    unsigned char *vpd_pg80;
    unsigned char current_tag;  /* current tag */
    struct scsi_target      *sdev_target;   /* used only for single_lun */

    unsigned int    sdev_bflags; /* black/white flags as also found in
                 * scsi_devinfo.[hc]. For now used only to
                 * pass settings from slave_alloc to scsi
                 * core. */
    unsigned int eh_timeout; /* Error handling timeout */
    unsigned removable:1;
    unsigned changed:1; /* Data invalid due to media change */
    unsigned busy:1;    /* Used to prevent races */
    unsigned lockable:1;    /* Able to prevent media removal */
    unsigned locked:1;      /* Media removal disabled */
    unsigned borken:1;  /* Tell the Seagate driver to be
                 * painfully slow on this device */
    unsigned disconnect:1;  /* can disconnect */
    unsigned soft_reset:1;  /* Uses soft reset option */
    unsigned sdtr:1;    /* Device supports SDTR messages (synchronous data transfer) */
    unsigned wdtr:1;    /* Device supports WDTR messages (16-bit wide data transfer) */
    unsigned ppr:1;     /* Device supports PPR (parallel protocol request) messages */
    unsigned tagged_supported:1;    /* Supports SCSI-II tagged queuing */
    unsigned simple_tags:1; /* simple queue tag messages are enabled */
    unsigned was_reset:1;   /* There was a bus reset on the bus for
                 * this device */
    unsigned expecting_cc_ua:1; /* Expecting a CHECK_CONDITION/UNIT_ATTN
                     * because we did a bus reset. */
    unsigned use_10_for_rw:1; /* first try 10-byte read / write */
    unsigned use_10_for_ms:1; /* first try 10-byte mode sense/select */
    unsigned no_report_opcodes:1;   /* no REPORT SUPPORTED OPERATION CODES */
    unsigned no_write_same:1;   /* no WRITE SAME command */
    unsigned use_16_for_rw:1; /* Use read/write(16) over read/write(10) */
    unsigned skip_ms_page_8:1;  /* do not use MODE SENSE page 0x08 */
    unsigned skip_ms_page_3f:1; /* do not use MODE SENSE page 0x3f */
    unsigned skip_vpd_pages:1;  /* do not read VPD pages */
    unsigned try_vpd_pages:1;   /* attempt to read VPD pages */
    unsigned use_192_bytes_for_3f:1; /* ask for 192 bytes from page 0x3f */
    unsigned no_start_on_add:1; /* do not issue start on add */
    unsigned allow_restart:1; /* issue START_UNIT in error handler */
    unsigned manage_start_stop:1;   /* Let HLD (sd) manage start/stop */
    unsigned start_stop_pwr_cond:1; /* Set power cond. in START_STOP_UNIT */
    unsigned no_uld_attach:1; /* disable connecting to upper level drivers */
    unsigned select_no_atn:1;
    unsigned fix_capacity:1;    /* READ_CAPACITY is too high by 1 */
    unsigned guess_capacity:1;  /* READ_CAPACITY might be too high by 1 */
    unsigned retry_hwerror:1;   /* Retry HARDWARE_ERROR */
    unsigned last_sector_bug:1; /* do not use multisector accesses on
                       SD_LAST_BUGGY_SECTORS */
    unsigned no_read_disc_info:1;   /* Avoid READ_DISC_INFO cmds */
    unsigned no_read_capacity_16:1; /* Avoid READ_CAPACITY_16 cmds */
    unsigned try_rc_10_first:1; /* Try READ_CAPACITY_10 first */
    unsigned is_visible:1;  /* is the device visible in sysfs */
    unsigned wce_default_on:1;  /* Cache is ON by default */
    unsigned no_dif:1;  /* T10 PI (DIF) should be disabled */
    unsigned broken_fua:1;      /* Don't set FUA bit */
    unsigned lun_in_cdb:1;      /* Store LUN bits in CDB[1] */

    atomic_t disk_events_disable_depth; /* disable depth for disk events */

    DECLARE_BITMAP(supported_events, SDEV_EVT_MAXBITS); /* supported events */
    DECLARE_BITMAP(pending_events, SDEV_EVT_MAXBITS); /* pending events */
    struct list_head event_list;    /* asserted events */
    struct work_struct event_work;

    unsigned int max_device_blocked; /* what device_blocked counts down from  */
#define SCSI_DEFAULT_DEVICE_BLOCKED 3

    atomic_t iorequest_cnt;
    atomic_t iodone_cnt;
    atomic_t ioerr_cnt;

    struct device       sdev_gendev, /* embedded generic device, linked into the device list of the SCSI bus type (scsi_bus_type) */
                sdev_dev; /* embedded class device, linked into the device list of the SCSI device class (sdev_class) */

    struct execute_work ew; /* used to get process context on put */
    struct work_struct  requeue_work;

    struct scsi_device_handler *handler; /* custom device handler */
    void            *handler_data;

    enum scsi_device_state sdev_state;  /* SCSI device state */
    unsigned long       sdev_data[0];   /* used by the SCSI transport layer */
} __attribute__((aligned(sizeof(unsigned long))));

內核定義的SCSI命令結構scsi_cmnd

scsi_cmnd結構由SCSI中間層創建,傳遞到SCSI低層驅動。每個IO請求會被創建一個scsi_cmnd,但scsi_cmnd并不一定是IO請求。scsi_cmnd最終轉化成一個具體的SCSI命令。除了命令描述塊之外,scsi_cmnd包含更豐富的信息,包括數據緩沖區、感測數據緩沖區、完成回調函數以及所關聯的塊設備驅動層請求等,是SCSI中間層執行SCSI命令的上下文。

/*
 * struct scsi_cmnd - context of one SCSI command.
 *
 * Besides the command bytes (cmnd/cmd_len) it carries the data and
 * protection buffers, the sense buffer, the completion callback and the
 * associated block-layer request.
 */
struct scsi_cmnd {
    struct scsi_device *device;  /* SCSI device descriptor this command belongs to */
    struct list_head list;  /* scsi_cmnd participates in queue lists; links into the SCSI device's command list */
    struct list_head eh_entry; /* entry for the host eh_cmd_q */
    struct delayed_work abort_work;
    int eh_eflags;      /* Used by error handlr */

    /*
     * A SCSI Command is assigned a nonzero serial_number before passed
     * to the driver's queue command function.  The serial_number is
     * cleared when scsi_done is entered indicating that the command
     * has been completed.  It is a bug for LLDDs to use this number
     * for purposes other than printk (and even that is only useful
     * for debugging).
     */
    unsigned long serial_number; /* unique serial number of the command */

    /*
     * This is set to jiffies as it was when the command was first
     * allocated.  It is used to time how long the command has
     * been outstanding
     */
    unsigned long jiffies_at_alloc; /* jiffies at allocation time, used to measure command processing time */

    int retries;  /* number of retries performed */
    int allowed;  /* number of retries allowed */

    unsigned char prot_op;    /* protection operation (DIF and DIX) */
    unsigned char prot_type;  /* DIF protection type */
    unsigned char prot_flags;

    unsigned short cmd_len;   /* command length */
    enum dma_data_direction sc_data_direction;  /* data transfer direction of the command */

    /* These elements define the operation we are about to perform */
    unsigned char *cmnd;  /* command bytes in the format defined by the SCSI specification */


    /* These elements define the operation we ultimately want to perform */
    struct scsi_data_buffer sdb;        /* data buffer of the command */
    struct scsi_data_buffer *prot_sdb;  /* protection information buffer of the command */

    unsigned underflow; /* Return error if less than
                   this amount is transferred */

    unsigned transfersize;  /* How much we are guaranteed to
                   transfer with each SCSI transfer
                   (ie, between disconnect /
                   reconnects.   Probably == sector
                   size (the transfer unit) */

    struct request *request;    /* The command we are
                       working on (generic block layer
                       request descriptor) */

#define SCSI_SENSE_BUFFERSIZE   96
    unsigned char *sense_buffer;    /* sense data buffer of the command */
                /* obtained by REQUEST SENSE when
                 * CHECK CONDITION is received on original
                 * command (auto-sense) */

    /* Low-level done function - can be used by low-level driver to point
     *        to completion function.  Not used by mid/upper level code. */
    void (*scsi_done) (struct scsi_cmnd *); /* called back when the low-level driver has completed the command */

    /*
     * The following fields can be written to by the host specific code.
     * Everything else should be left alone.
     */
    struct scsi_pointer SCp;    /* Scratchpad used by some host adapters */

    unsigned char *host_scribble;   /* The host adapter is allowed to
                     * call scsi_malloc and get some memory
                     * and hang it here.  The host adapter
                     * is also expected to call scsi_free
                     * to release this memory.  (The memory
                     * obtained by scsi_malloc is guaranteed
                     * to be at an address < 16Mb). */

    int result;     /* Status code from lower level driver */
    int flags;      /* Command flags */

    unsigned char tag;  /* SCSI-II queued command tag */
};

驅動scsi_driver

/*
 * struct scsi_driver - a SCSI driver: an embedded generic device_driver
 * plus SCSI-specific callbacks.
 */
struct scsi_driver {
    struct device_driver    gendrv;  /* "inherits" from device_driver by embedding it */

    void (*rescan)(struct device *); /* rescan callback (original note: invoked before rescanning) */
    int (*init_command)(struct scsi_cmnd *);
    void (*uninit_command)(struct scsi_cmnd *);
    int (*done)(struct scsi_cmnd *);  /* called when the low-level driver completes a command; used to compute the number of bytes completed */
    int (*eh_action)(struct scsi_cmnd *, int); /* error-handling callback */
};

設備模型

  • scsi_bus_type: scsi子系統總線類型

/*
 * Bus type of the SCSI subsystem. The power-management operations are only
 * hooked up when CONFIG_PM is enabled.
 */
struct bus_type scsi_bus_type = {
        .name       = "scsi",   /* corresponds to /sys/bus/scsi */
        .match      = scsi_bus_match,
    .uevent     = scsi_bus_uevent,
#ifdef CONFIG_PM
    .pm     = &scsi_bus_pm_ops,
#endif
};
EXPORT_SYMBOL_GPL(scsi_bus_type);
  • shost_class: scsi子系統類

/* Device class that SCSI host adapters are registered under. */
static struct class shost_class = {
    .name       = "scsi_host",  /* corresponds to /sys/class/scsi_host */
    .dev_release    = scsi_host_cls_release,
};

[原文此處有一張圖片,未能隨文本保留]

初始化過程

操作系統啟動時,會加載scsi子系統,入口函數是init_scsi,使用subsys_initcall定義:

/*
 * init_scsi - entry point that brings up the SCSI subsystem.
 *
 * Runs the initialization steps below in order. If a step fails, every
 * step that already succeeded is undone in reverse order through the
 * cleanup labels, an error is logged and the failing step's error code is
 * returned. Returns 0 on success.
 */
static int __init init_scsi(void)
{
    int error;

    error = scsi_init_queue();  /* set up the memory pools needed for scatter-gather lists */
    if (error)
        return error;
    error = scsi_init_procfs(); /* set up the SCSI-related entries in procfs */
    if (error)
        goto cleanup_queue;
    error = scsi_init_devinfo();/* set up the dynamic SCSI device info list */
    if (error)
        goto cleanup_procfs;
    error = scsi_init_hosts();  /* register shost_class, which creates scsi_host under /sys/class/ */
    if (error)
        goto cleanup_devlist;
    error = scsi_init_sysctl(); /* register the SCSI sysctl table */
    if (error)
        goto cleanup_hosts;
    error = scsi_sysfs_register(); /* register the scsi_bus_type bus type and the sdev_class class */
    if (error)
        goto cleanup_sysctl;

    scsi_netlink_init(); /* set up the SCSI transport netlink interface */

    printk(KERN_NOTICE "SCSI subsystem initialized\n");
    return 0;

    /* unwind ladder: each label undoes one step and falls through to the next */
cleanup_sysctl:
    scsi_exit_sysctl();
cleanup_hosts:
    scsi_exit_hosts();
cleanup_devlist:
    scsi_exit_devinfo();
cleanup_procfs:
    scsi_exit_procfs();
cleanup_queue:
    scsi_exit_queue();
    /* error is negated here so the log shows it with the opposite sign */
    printk(KERN_ERR "SCSI subsystem failed to initialize, error = %d\n",
           -error);
    return error;
}

scsi_init_hosts函數初始化scsi子系統主機適配器所屬的類shost_class:

/*
 * scsi_init_hosts - register the host adapter class (shost_class).
 *
 * Returns whatever class_register() returns.
 */
int scsi_init_hosts(void)
{
    int ret = class_register(&shost_class);

    return ret;
}

scsi_sysfs_register函數初始化scsi子系統總線類型scsi_bus_type和設備所屬的類sdev_class:

/*
 * scsi_sysfs_register - register scsi_bus_type and then sdev_class.
 *
 * If the class cannot be registered, the bus registration is rolled back
 * so nothing stays registered. Returns 0 on success, otherwise the error
 * code of the step that failed.
 */
int scsi_sysfs_register(void)
{
    int ret = bus_register(&scsi_bus_type);

    if (ret)
        return ret;

    ret = class_register(&sdev_class);
    if (ret)
        bus_unregister(&scsi_bus_type);

    return ret;
}

scsi低層驅動是面向主機適配器的,低層驅動被加載時,需要添加主機適配器。主機適配器添加有兩種方式:1.在PCI子系統掃描掛載驅動時添加;2.手動方式添加。所有基于硬件PCI接口的主機適配器都采用第一種方式。添加主機適配器包括兩個步驟:
1. 分配主機適配器數據結構scsi_host_alloc
2. 將主機適配器添加到系統scsi_add_host

/*
 * scsi_host_alloc - allocate and pre-initialize a host adapter descriptor.
 * @sht:      host template the adapter is created from
 * @privsize: number of extra bytes allocated right behind the Scsi_Host
 *
 * Allocates a zeroed Scsi_Host plus @privsize bytes, fills in defaults and
 * the values taken from @sht, initializes the two embedded devices, starts
 * the error-handler thread and creates the task-management workqueue.
 *
 * Returns the new host, or NULL if the allocation, the thread or the
 * workqueue could not be created.
 */
struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
{
    struct Scsi_Host *shost;
    gfp_t gfp_mask = GFP_KERNEL;

    if (sht->unchecked_isa_dma && privsize)
        gfp_mask |= __GFP_DMA;
    /* allocate the Scsi_Host and the private data area in one go */
    shost = kzalloc(sizeof(struct Scsi_Host) + privsize, gfp_mask);
    if (!shost)
        return NULL;

    shost->host_lock = &shost->default_lock;
    spin_lock_init(shost->host_lock);
    shost->shost_state = SHOST_CREATED; /* initial state */
    INIT_LIST_HEAD(&shost->__devices);  /* list of SCSI devices */
    INIT_LIST_HEAD(&shost->__targets);  /* list of targets */
    INIT_LIST_HEAD(&shost->eh_cmd_q);   /* list of commands that hit an error */
    INIT_LIST_HEAD(&shost->starved_list);   /* "starved" list */
    init_waitqueue_head(&shost->host_wait);
    mutex_init(&shost->scan_mutex);

    /*
     * subtract one because we increment first then return, but we need to
     * know what the next host number was before increment
     * (host numbers are handed out incrementally)
     */
    shost->host_no = atomic_inc_return(&scsi_host_next_hn) - 1;
    shost->dma_channel = 0xff;

    /* These three are default values which can be overridden */
    shost->max_channel = 0; /* default highest channel number is 0 */
    shost->max_id = 8;      /* default upper bound for target ids */
    shost->max_lun = 8;     /* default upper bound for LUNs */

    /* Give each shost a default transportt */
    shost->transportt = &blank_transport_template;  /* SCSI transport layer template */

    /*
     * All drivers right now should be able to handle 12 byte
     * commands.  Every so often there are requests for 16 byte
     * commands, but individual low-level drivers need to certify that
     * they actually do something sensible with such commands.
     */
    shost->max_cmd_len = 12;  /* longest SCSI command length */
    shost->hostt = sht;       /* remember the host template */
    shost->this_id = sht->this_id;
    shost->can_queue = sht->can_queue;
    shost->sg_tablesize = sht->sg_tablesize;
    shost->sg_prot_tablesize = sht->sg_prot_tablesize;
    shost->cmd_per_lun = sht->cmd_per_lun;
    shost->unchecked_isa_dma = sht->unchecked_isa_dma;
    shost->use_clustering = sht->use_clustering;
    shost->no_write_same = sht->no_write_same;

    /*
     * eh_deadline stays at -1 when shost_eh_deadline is -1 or the template
     * has no host reset handler; otherwise it is shost_eh_deadline * HZ,
     * clamped to INT_MAX when that product would exceed INT_MAX.
     */
    if (shost_eh_deadline == -1 || !sht->eh_host_reset_handler)
        shost->eh_deadline = -1;
    else if ((ulong) shost_eh_deadline * HZ > INT_MAX) {
        shost_printk(KERN_WARNING, shost,
                 "eh_deadline %u too large, setting to %u\n",
                 shost_eh_deadline, INT_MAX / HZ);
        shost->eh_deadline = INT_MAX;
    } else
        shost->eh_deadline = shost_eh_deadline * HZ;

    if (sht->supported_mode == MODE_UNKNOWN) /* the template selects the adapter mode */
        /* means we didn't set it ... default to INITIATOR */
        shost->active_mode = MODE_INITIATOR;
    else
        shost->active_mode = sht->supported_mode;

    if (sht->max_host_blocked)
        shost->max_host_blocked = sht->max_host_blocked;
    else
        shost->max_host_blocked = SCSI_DEFAULT_HOST_BLOCKED;

    /*
     * If the driver imposes no hard sector transfer limit, start at
     * machine infinity initially.
     */
    if (sht->max_sectors)
        shost->max_sectors = sht->max_sectors;
    else
        shost->max_sectors = SCSI_DEFAULT_MAX_SECTORS;

    /*
     * assume a 4GB boundary, if not set
     */
    if (sht->dma_boundary)
        shost->dma_boundary = sht->dma_boundary;
    else
        shost->dma_boundary = 0xffffffff;

    shost->use_blk_mq = scsi_use_blk_mq && !shost->hostt->disable_blk_mq;

    device_initialize(&shost->shost_gendev); /* embedded generic device */
    dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
    shost->shost_gendev.bus = &scsi_bus_type;   /* bus type of the adapter */
    shost->shost_gendev.type = &scsi_host_type; /* device type of the adapter */

    device_initialize(&shost->shost_dev);    /* embedded class device */
    shost->shost_dev.parent = &shost->shost_gendev; /* the class device's parent is the generic device */
    shost->shost_dev.class = &shost_class;   /* the class device belongs to shost_class */
    dev_set_name(&shost->shost_dev, "host%d", shost->host_no);
    shost->shost_dev.groups = scsi_sysfs_shost_attr_groups;  /* attribute groups of the class device */

    /* start the adapter's error-recovery kernel thread */
    shost->ehandler = kthread_run(scsi_error_handler, shost,
            "scsi_eh_%d", shost->host_no);
    if (IS_ERR(shost->ehandler)) {
        shost_printk(KERN_WARNING, shost,
            "error handler thread failed to spawn, error = %ld\n",
            PTR_ERR(shost->ehandler));
        goto fail_kfree;
    }
    /* allocate the task-management workqueue */
    shost->tmf_work_q = alloc_workqueue("scsi_tmf_%d",
                        WQ_UNBOUND | WQ_MEM_RECLAIM,
                       1, shost->host_no);
    if (!shost->tmf_work_q) {
        shost_printk(KERN_WARNING, shost,
                 "failed to create tmf workq\n");
        goto fail_kthread;
    }
    /* add the template's directory to procfs (original note: /proc/scsi/<adapter name>) */
    scsi_proc_hostdir_add(shost->hostt);
    return shost;

 fail_kthread:
    kthread_stop(shost->ehandler);
 fail_kfree:
    /*
     * NOTE(review): shost is freed directly although dev_set_name() has
     * already been called on both embedded devices above; confirm the
     * device names are not leaked on this path.
     */
    kfree(shost);
    return NULL;
}
EXPORT_SYMBOL(scsi_host_alloc);
/*
 * scsi_add_host - add a host adapter, using @dev for DMA as well.
 * @host: adapter to add
 * @dev:  parent device
 *
 * Thin wrapper around scsi_add_host_with_dma() that passes @dev as both
 * the parent device and the DMA device, and returns its result.
 */
static inline int __must_check scsi_add_host(struct Scsi_Host *host,
                         struct device *dev)
{
    struct device *dma_dev = dev;

    return scsi_add_host_with_dma(host, dev, dma_dev);
}

/*
 * scsi_add_host_with_dma - add a previously allocated host adapter.
 * @shost:   adapter to add
 * @dev:     parent device; &platform_bus is used when this is NULL and no
 *           parent has been set yet
 * @dma_dev: device used for DMA; falls back to the parent when NULL
 *
 * Sets up tags and the command freelist, adds the generic and the class
 * device, allocates the optional transport data and workqueue, and finally
 * registers the host in sysfs and procfs.
 *
 * Returns 0 on success or a negative error code; on failure the steps that
 * were already done are unwound through the labels at the end.
 */
int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
               struct device *dma_dev)
{
    struct scsi_host_template *sht = shost->hostt;
    int error = -EINVAL;

    shost_printk(KERN_INFO, shost, "%s\n",
            sht->info ? sht->info(shost) : sht->name);

    if (!shost->can_queue) {
        shost_printk(KERN_ERR, shost,
                 "can_queue = 0 no longer supported\n");
        goto fail;
    }

    /*
     * With multi-queue I/O enabled for this adapter, set up the multi-queue
     * tags; otherwise initialize the block-layer tag map.
     */
    if (shost_use_blk_mq(shost)) {
        error = scsi_mq_setup_tags(shost);
        if (error)
            goto fail;
    } else {
        shost->bqt = blk_init_tags(shost->can_queue,
                shost->hostt->tag_alloc_policy);
        if (!shost->bqt) {
            error = -ENOMEM;
            goto fail;
        }
    }

    /*
     * Note that we allocate the freelist even for the MQ case for now,
     * as we need a command set aside for scsi_reset_provider.  Having
     * the full host freelist and one command available for that is a
     * little heavy-handed, but avoids introducing a special allocator
     * just for this.  Eventually the structure of scsi_reset_provider
     * will need a major overhaul.
     *
     * (Original note: this allocates the buffers for SCSI commands and
     * sense data plus the reserve list of commands.)
     */
    error = scsi_setup_command_freelist(shost);
    if (error)
        goto out_destroy_tags;

    /*
     * Set the adapter's parent device, which fixes its place in sysfs;
     * per the original note the caller typically passes a pci_dev in @dev.
     */
    if (!shost->shost_gendev.parent)
        shost->shost_gendev.parent = dev ? dev : &platform_bus; /* NULL dev: fall back to platform_bus */
    if (!dma_dev)
        dma_dev = shost->shost_gendev.parent;

    shost->dma_dev = dma_dev;

    error = device_add(&shost->shost_gendev);  /* add the adapter's generic device */
    if (error)
        goto out_destroy_freelist;

    pm_runtime_set_active(&shost->shost_gendev);
    pm_runtime_enable(&shost->shost_gendev);
    device_enable_async_suspend(&shost->shost_gendev); /* allow async suspend of the generic device */

    scsi_host_set_state(shost, SHOST_RUNNING);  /* move the adapter to the running state */
    get_device(shost->shost_gendev.parent);     /* take a reference on the parent device */

    device_enable_async_suspend(&shost->shost_dev);  /* allow async suspend of the class device */

    error = device_add(&shost->shost_dev);    /* add the adapter's class device */
    if (error)
        goto out_del_gendev;

    get_device(&shost->shost_gendev);

    if (shost->transportt->host_size) {  /* data area used by the SCSI transport layer */
        shost->shost_data = kzalloc(shost->transportt->host_size,
                     GFP_KERNEL);
        if (shost->shost_data == NULL) {
            error = -ENOMEM;
            goto out_del_dev;
        }
    }

    if (shost->transportt->create_work_queue) {
        snprintf(shost->work_q_name, sizeof(shost->work_q_name),
             "scsi_wq_%d", shost->host_no);
        /* workqueue used by the SCSI transport layer */
        shost->work_q = create_singlethread_workqueue(
                    shost->work_q_name);
        if (!shost->work_q) {
            error = -EINVAL;
            goto out_free_shost_data;
        }
    }

    error = scsi_sysfs_add_host(shost); /* add the adapter to the subsystem */
    if (error)
        goto out_destroy_host;

    scsi_proc_host_add(shost);  /* publish the adapter's information in procfs */
    return error;

    /*
     * Unwind ladder; each label falls through to the next.
     * NOTE(review): no put_device() for the two get_device() calls above
     * and no pm_runtime disable is visible on these paths, and the
     * non-blk-mq tag map (shost->bqt) is not released here -- confirm these
     * are handled elsewhere (e.g. by the host's release path).
     */
 out_destroy_host:
    if (shost->work_q)
        destroy_workqueue(shost->work_q);
 out_free_shost_data:
    kfree(shost->shost_data);
 out_del_dev:
    device_del(&shost->shost_dev);
 out_del_gendev:
    device_del(&shost->shost_gendev);
 out_destroy_freelist:
    scsi_destroy_command_freelist(shost);
 out_destroy_tags:
    if (shost_use_blk_mq(shost))
        scsi_mq_destroy_tags(shost);
 fail:
    return error;
}
EXPORT_SYMBOL(scsi_add_host_with_dma);

設備探測過程

在系統啟動過程中,會掃描默認的PCI根總線,從而觸發PCI設備掃描的過程,開始構造PCI設備樹。SCSI主機適配器是掛載在PCI總線上的設備,它作為PCI設備會被PCI總線驅動層掃描到(PCI設備的掃描采用配置空間訪問的方式)。掃描到SCSI主機適配器后,操作系統開始加載SCSI主機適配器驅動,SCSI主機適配器驅動就是上面所說的低層驅動。SCSI主機適配器驅動根據SCSI主機適配器模板分配SCSI主機適配器描述符,并添加到系統,之后啟動通過SCSI主機適配器擴展出來的下一級總線–SCSI總線的掃描過程。

SCSI中間層依次以可能的ID和LUN構造INQUIRY命令,之后將這些INQUIRY命令提交給塊IO子系統,后者最終將調用SCSI中間層的策略例程,再次提取到SCSI命令結構后,調用SCSI低層驅動的queuecommand回調函數實現。
對于給定ID的目標節點,如果它在SCSI總線上存在,那么它一定要實現對LUN0的INQUIRY響應。也就是說,向某個ID的目標節點的LUN0發送INQUIRY命令,或依次向各個LUN嘗試發送INQUIRY命令,檢查是否能收到響應,最終SCSI中間層能夠得到SCSI域中所連接的邏輯設備及其信息。

SCSI總線具體的掃描方式可以由具體的主機適配器固件、主機適配器驅動實現,在此只討論由主機適配器驅動調用scsi中間層提供的通用掃描函數scsi_scan_host的實現方式。

/*
 * scsi_scan_host - scan the given host adapter.
 * @shost: adapter to scan
 *
 * Does nothing when scsi_scan_type starts with "none" or when
 * scsi_autopm_get_host() returns a negative value. Otherwise the scan is
 * scheduled asynchronously if scsi_prep_async_scan() hands back scan data,
 * and is run synchronously in the caller's context if it does not.
 */
void scsi_scan_host(struct Scsi_Host *shost)
{
    struct async_scan_data *async_data;

    if (!strncmp(scsi_scan_type, "none", 4))
        return;
    if (scsi_autopm_get_host(shost) < 0)
        return;

    async_data = scsi_prep_async_scan(shost);
    if (async_data) {
        /*
         * register with the async subsystem so wait_for_device_probe()
         * will flush this work; scsi_autopm_put_host(shost) is called
         * in scsi_finish_async_scan()
         */
        async_schedule(do_scan_async, async_data);
        return;
    }

    /* no async scan data: scan right here and drop the PM reference */
    do_scsi_scan_host(shost);
    scsi_autopm_put_host(shost);
}
EXPORT_SYMBOL(scsi_scan_host);

scsi_scan_host函數是scsi中間層提供的主機適配器掃描函數。對于有自定義掃描邏輯需求的主機適配器驅動,可以設置主機適配器模板的回調函數,由scsi_scan_host函數調用回調來實現自定義掃描。
scsi_scan_type變量指定了掃描方式:async、sync、none。無論最終掃描方式是同步還是異步,都是由do_scsi_scan_host函數實現:

/*
 * do_scsi_scan_host - perform the actual scan of a host adapter.
 * @shost: adapter to scan
 *
 * Without a scan_finished callback in the host template, the generic
 * wildcard scan is used. With one, the optional scan_start callback is
 * invoked and scan_finished is then polled, sleeping 10 ms between calls,
 * until it returns non-zero. scan_finished receives the jiffies elapsed
 * since just before scan_start.
 */
static void do_scsi_scan_host(struct Scsi_Host *shost)
{
    unsigned long began;

    if (!shost->hostt->scan_finished) {
        /* generic scan: wildcards cover every channel, target and LUN */
        scsi_scan_host_selected(shost, SCAN_WILD_CARD, SCAN_WILD_CARD,
                SCAN_WILD_CARD, 0);
        return;
    }

    /* driver-defined scan */
    began = jiffies;
    if (shost->hostt->scan_start)
        shost->hostt->scan_start(shost);

    while (!shost->hostt->scan_finished(shost, jiffies - began))
        msleep(10);
}

如果主機適配器模板設置了自定義掃描函數,do_scsi_scan_host函數將會調用它。如果沒有設置,則使用默認的掃描函數scsi_scan_host_selected執行掃描。

/*
 * scsi_scan_host_selected - scan a channel/target/LUN selection on a host.
 * @shost:   host adapter to scan
 * @channel: channel to scan, or SCAN_WILD_CARD for every channel
 * @id:      target id to scan, or SCAN_WILD_CARD
 * @lun:     LUN to scan, or SCAN_WILD_CARD
 * @rescan:  passed through unchanged to scsi_scan_channel()
 *
 * Returns -EINVAL when a non-wildcard channel, id or lun lies outside the
 * limits stored in @shost. Returns 0 otherwise, including the case where
 * the scan was skipped because scsi_host_scan_allowed() refused or
 * scsi_autopm_get_host() returned non-zero.
 */
int scsi_scan_host_selected(struct Scsi_Host *shost, unsigned int channel,
                unsigned int id, u64 lun, int rescan)
{
    unsigned int ch;

    SCSI_LOG_SCAN_BUS(3, shost_printk (KERN_INFO, shost,
        "%s: <%u:%u:%llu>\n",
        __func__, channel, id, lun));

    /* reject explicit (non-wildcard) coordinates beyond the host's limits */
    if (channel != SCAN_WILD_CARD && channel > shost->max_channel)
        return -EINVAL;
    if (id != SCAN_WILD_CARD && id >= shost->max_id)
        return -EINVAL;
    if (lun != SCAN_WILD_CARD && lun >= shost->max_lun)
        return -EINVAL;

    mutex_lock(&shost->scan_mutex);
    if (!shost->async_scan)
        scsi_complete_async_scans();

    /* only scan when the host state allows it and the PM get succeeds */
    if (!scsi_host_scan_allowed(shost) || scsi_autopm_get_host(shost) != 0)
        goto out_unlock;

    if (channel != SCAN_WILD_CARD) {
        /* one specific channel */
        scsi_scan_channel(shost, channel, id, lun, rescan);
    } else {
        /* walk every channel of the host */
        for (ch = 0; ch <= shost->max_channel; ch++)
            scsi_scan_channel(shost, ch, id, lun, rescan);
    }
    scsi_autopm_put_host(shost);

 out_unlock:
    mutex_unlock(&shost->scan_mutex);
    return 0;
}

scsi_scan_host_selected函數掃描指定的主機適配器,根據輸入的參數決定是遍歷掃描所有channel還是只掃描指定channel,具體掃描通過函數scsi_scan_channel完成。

/*
 * scsi_scan_channel - scan one channel of a host adapter.
 * @shost:   host adapter
 * @channel: channel to scan
 * @id:      target id to scan, or SCAN_WILD_CARD for ids 0..max_id-1
 * @lun:     LUN selection, handed on to __scsi_scan_target()
 * @rescan:  handed on to __scsi_scan_target()
 *
 * For a wildcard id the targets are visited from low to high id, or from
 * high to low when shost->reverse_ordering is set.
 */
static void scsi_scan_channel(struct Scsi_Host *shost, unsigned int channel,
                  unsigned int id, u64 lun, int rescan)
{
    unsigned int step;
    uint target_id;

    if (id != SCAN_WILD_CARD) {
        /* a single, explicitly requested target */
        __scsi_scan_target(&shost->shost_gendev, channel,
                id, lun, rescan);
        return;
    }

    /*
     * XXX adapter drivers when possible (FCP, iSCSI)
     * could modify max_id to match the current max,
     * not the absolute max.
     *
     * XXX add a shost id iterator, so for example,
     * the FC ID can be the same as a target id
     * without a huge overhead of sparse id's.
     */
    for (step = 0; step < shost->max_id; ++step) {
        /* with reverse ordering, scan from high to low id */
        target_id = shost->reverse_ordering ?
                shost->max_id - step - 1 : step;
        __scsi_scan_target(&shost->shost_gendev, channel,
                target_id, lun, rescan);
    }
}

__scsi_scan_target函數掃描指定target內部的lun。

/*
 * __scsi_scan_target - scan the LUNs of one target.
 * @parent:  device the host adapter is looked up from (dev_to_shost())
 * @channel: channel of the target
 * @id:      target id; nothing is done when it equals the host's own id
 * @lun:     a specific LUN to probe, or SCAN_WILD_CARD for a full scan
 * @rescan:  handed on to the probing helpers
 *
 * For a specific LUN only that LUN is probed. For a wildcard, LUN 0 is
 * probed first; if something responds, a REPORT LUNS based scan is tried
 * and, should that not scan the target, a sequential LUN scan is done.
 */
static void __scsi_scan_target(struct device *parent, unsigned int channel,
        unsigned int id, u64 lun, int rescan)
{
    struct Scsi_Host *shost = dev_to_shost(parent);
    int bflags = 0;
    int res;
    struct scsi_target *starget;

    if (shost->this_id == id)
        /*
         * Don't scan the host adapter
         */
        return;
    /* allocate and initialize the target structure for this id */
    starget = scsi_alloc_target(parent, channel, id);
    if (!starget)
        return;
    scsi_autopm_get_target(starget);

    if (lun != SCAN_WILD_CARD) {
        /*
         * Scan for a specific host/chan/id/lun.
         * (Probes the given LUN on the target and adds it to the
         * subsystem.)
         */
        scsi_probe_and_add_lun(starget, lun, NULL, NULL, rescan, NULL);
        goto out_reap;
    }

    /*
     * Scan LUN 0, if there is some response, scan further. Ideally, we
     * would not configure LUN 0 until all LUNs are scanned.
     */
    res = scsi_probe_and_add_lun(starget, 0, &bflags, NULL, rescan, NULL);
    if (res == SCSI_SCAN_LUN_PRESENT || res == SCSI_SCAN_TARGET_PRESENT) {
        /* try a REPORT LUNS based scan first */
        if (scsi_report_lun_scan(starget, bflags, rescan) != 0)
            /*
             * The REPORT LUN did not scan the target,
             * do a sequential scan.
             */
            scsi_sequential_lun_scan(starget, bflags,
                         starget->scsi_level, rescan);
    }

 out_reap:
    scsi_autopm_put_target(starget);
    /*
     * paired with scsi_alloc_target(): determine if the target has
     * any children at all and if not, nuke it
     */
    scsi_target_reap(starget);

    put_device(&starget->dev);
}

掃描到target時分配并初始化scsi_target結構,scsi_probe_and_add_lun函數完成探測target中的lun,并將發現的lun添加到系統。

/*
 * scsi_probe_and_add_lun - probe a single LUN on @starget and add it if found.
 * @starget:  target the LUN belongs to
 * @lun:      LUN to probe
 * @bflagsp:  if non-NULL, receives the device flags determined for the LUN
 * @sdevp:    if non-NULL, receives the scsi_device on SCSI_SCAN_LUN_PRESENT;
 *            the caller then owns a reference (presumably dropped with
 *            scsi_device_put() - confirm against callers)
 * @rescan:   non-zero means an already known device is reported as present
 *            without being probed again
 * @hostdata: passed through to scsi_alloc_sdev()
 *
 * Returns:
 *   SCSI_SCAN_NO_RESPONSE    - allocation or INQUIRY failed, or the device
 *                              could not be added
 *   SCSI_SCAN_TARGET_PRESENT - the target answered but no usable LUN exists
 *   SCSI_SCAN_LUN_PRESENT    - the LUN exists (already known or newly added)
 */
static int scsi_probe_and_add_lun(struct scsi_target *starget,
                  u64 lun, int *bflagsp,
                  struct scsi_device **sdevp, int rescan,
                  void *hostdata)
{
    struct scsi_device *sdev;
    unsigned char *result;
    int bflags, res = SCSI_SCAN_NO_RESPONSE, result_len = 256;
    struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);

    /*
     * The rescan flag is used as an optimization, the first scan of a
     * host adapter calls into here with rescan == 0.
     */
    sdev = scsi_device_lookup_by_target(starget, lun);  /* look up this LUN on the target */
    if (sdev) {   /* the LUN is already known on this target */
        if (rescan || !scsi_device_created(sdev)) {
            SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
                "scsi scan: device exists on %s\n",
                dev_name(&sdev->sdev_gendev)));

            /*
             * Fetch the flags before the reference from the lookup
             * may be dropped below, so sdev->vendor/model are never
             * read after our reference is gone.
             */
            if (bflagsp)
                *bflagsp = scsi_get_device_flags(sdev,
                                 sdev->vendor,
                                 sdev->model);

            if (sdevp)
                *sdevp = sdev;  /* hand the reference to the caller */
            else
                scsi_device_put(sdev);

            return SCSI_SCAN_LUN_PRESENT;
        }
        scsi_device_put(sdev);
    } else
        sdev = scsi_alloc_sdev(starget, lun, hostdata); /* unknown LUN: allocate a scsi_device */
    if (!sdev)
        goto out;

    result = kmalloc(result_len, GFP_ATOMIC |
            ((shost->unchecked_isa_dma) ? __GFP_DMA : 0));
    if (!result)
        goto out_free_sdev;

    /* Send INQUIRY to the device to find out whether it is there. */
    if (scsi_probe_lun(sdev, result, result_len, &bflags))
        goto out_free_result;

    if (bflagsp)
        *bflagsp = bflags;
    /*
     * result contains valid SCSI INQUIRY data.
     */
    if (((result[0] >> 5) == 3) && !(bflags & BLIST_ATTACH_PQ3)) {
        /*
         * For a Peripheral qualifier 3 (011b), the SCSI
         * spec says: The device server is not capable of
         * supporting a physical device on this logical
         * unit.
         *
         * For disks, this implies that there is no
         * logical disk configured at sdev->lun, but there
         * is a target id responding.
         */
        SCSI_LOG_SCAN_BUS(2, sdev_printk(KERN_INFO, sdev, "scsi scan:"
                   " peripheral qualifier of 3, device not"
                   " added\n"));
        if (lun == 0) {
            SCSI_LOG_SCAN_BUS(1, {
                unsigned char vend[9];
                unsigned char mod[17];

                sdev_printk(KERN_INFO, sdev,
                    "scsi scan: consider passing scsi_mod."
                    "dev_flags=%s:%s:0x240 or 0x1000240\n",
                    scsi_inq_str(vend, result, 8, 16),
                    scsi_inq_str(mod, result, 16, 32));
            });

        }

        res = SCSI_SCAN_TARGET_PRESENT;
        goto out_free_result;
    }

    /*
     * Some targets may set slight variations of PQ and PDT to signal
     * that no LUN is present, so don't add sdev in these cases.
     * Two specific examples are:
     * 1) NetApp targets: return PQ=1, PDT=0x1f
     * 2) USB UFI: returns PDT=0x1f, with the PQ bits being "reserved"
     *    in the UFI 1.0 spec (we cannot rely on reserved bits).
     *
     * References:
     * 1) SCSI SPC-3, pp. 145-146
     * PQ=1: "A peripheral device having the specified peripheral
     * device type is not connected to this logical unit. However, the
     * device server is capable of supporting the specified peripheral
     * device type on this logical unit."
     * PDT=0x1f: "Unknown or no device type"
     * 2) USB UFI 1.0, p. 20
     * PDT=00h Direct-access device (floppy)
     * PDT=1Fh none (no FDD connected to the requested logical unit)
     */
    if (((result[0] >> 5) == 1 || starget->pdt_1f_for_no_lun) &&
        (result[0] & 0x1f) == 0x1f &&
        !scsi_is_wlun(lun)) {
        SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
                    "scsi scan: peripheral device type"
                    " of 31, no device added\n"));
        res = SCSI_SCAN_TARGET_PRESENT;
        goto out_free_result;
    }

    /* Configure the device from the INQUIRY data and add it to the subsystem. */
    res = scsi_add_lun(sdev, result, &bflags, shost->async_scan);
    if (res == SCSI_SCAN_LUN_PRESENT) {
        if (bflags & BLIST_KEY) {
            sdev->lockable = 0;
            scsi_unlock_floptical(sdev, result);
        }
    }

 out_free_result:
    kfree(result);
 out_free_sdev:
    if (res == SCSI_SCAN_LUN_PRESENT) {
        if (sdevp) {
            /* The caller wants the device: take a reference for it. */
            if (scsi_device_get(sdev) == 0) {
                *sdevp = sdev;
            } else {
                __scsi_remove_device(sdev);
                res = SCSI_SCAN_NO_RESPONSE;
            }
        }
    } else
        /* Anything other than "LUN present" tears the device down again. */
        __scsi_remove_device(sdev);
 out:
    return res;
}

scsi_probe_and_add_lun函數由名字可知,完成lun的probe和add兩個操作:
1. 探測邏輯設備scsi_probe_lun,發送INQUIRY命令到具體設備。
2. 添加邏輯設備到系統scsi_add_lun,根據INQUIRY命令返回值添加lun到系統。

/*
 * scsi_probe_lun - send INQUIRY to @sdev and record its basic properties.
 * @sdev:       device to probe
 * @inq_result: buffer that receives the INQUIRY data
 * @result_len: size of @inq_result; note that it is not referenced anywhere
 *              in this function body
 * @bflags:     out: zeroed on entry, then set from scsi_get_device_flags()
 *              once an INQUIRY succeeds
 *
 * Performs up to three INQUIRY passes with different transfer lengths; each
 * pass retries up to three times on a power-on/reset style UNIT ATTENTION or
 * when no data was transferred.  On success sdev->inquiry_len,
 * sdev->scsi_level, the target's scsi_level and sdev->lun_in_cdb are updated.
 *
 * Returns 0 on success, -EIO if the last INQUIRY attempt failed.
 */
static int scsi_probe_lun(struct scsi_device *sdev, unsigned char *inq_result,
              int result_len, int *bflags)
{
    unsigned char scsi_cmd[MAX_COMMAND_SIZE];
    int first_inquiry_len, try_inquiry_len, next_inquiry_len;
    int response_len = 0;
    int pass, count, result;
    struct scsi_sense_hdr sshdr;

    *bflags = 0;

    /* Perform up to 3 passes.  The first pass uses a conservative
     * transfer length of 36 unless sdev->inquiry_len specifies a
     * different value. */
    first_inquiry_len = sdev->inquiry_len ? sdev->inquiry_len : 36;
    try_inquiry_len = first_inquiry_len;
    pass = 1;

 next_pass:
    SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
                "scsi scan: INQUIRY pass %d length %d\n",
                pass, try_inquiry_len));

    /* Each pass gets up to three chances to ignore Unit Attention */
    for (count = 0; count < 3; ++count) {
        int resid;

        memset(scsi_cmd, 0, 6);
        scsi_cmd[0] = INQUIRY;      /* opcode: INQUIRY */
        scsi_cmd[4] = (unsigned char) try_inquiry_len;

        memset(inq_result, 0, try_inquiry_len);
        /* Send the SCSI command, allowing 3 retries. */
        result = scsi_execute_req(sdev,  scsi_cmd, DMA_FROM_DEVICE,
                      inq_result, try_inquiry_len, &sshdr,
                      HZ / 2 + HZ * scsi_inq_timeout, 3,
                      &resid);

        SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
                "scsi scan: INQUIRY %s with code 0x%x\n",
                result ? "failed" : "successful", result));

        if (result) {
            /*
             * not-ready to ready transition [asc/ascq=0x28/0x0]
             * or power-on, reset [asc/ascq=0x29/0x0], continue.
             * INQUIRY should not yield UNIT_ATTENTION
             * but many buggy devices do so anyway.
             */
            if ((driver_byte(result) & DRIVER_SENSE) &&
                scsi_sense_valid(&sshdr)) {
                if ((sshdr.sense_key == UNIT_ATTENTION) &&
                    ((sshdr.asc == 0x28) ||
                     (sshdr.asc == 0x29)) &&
                    (sshdr.ascq == 0))
                    continue;
            }
        } else {
            /*
             * if nothing was transferred, we try
             * again. It's a workaround for some USB
             * devices.
             */
            if (resid == try_inquiry_len)
                continue;
        }
        break;
    }

    if (result == 0) {
        /* Clean up the vendor (8), model (16) and revision (4) strings. */
        sanitize_inquiry_string(&inq_result[8], 8);
        sanitize_inquiry_string(&inq_result[16], 16);
        sanitize_inquiry_string(&inq_result[32], 4);

        /* Length the device claims: INQUIRY byte 4 plus 5. */
        response_len = inq_result[4] + 5;
        if (response_len > 255)
            response_len = first_inquiry_len;   /* sanity */

        /*
         * Get any flags for this device.
         *
         * XXX add a bflags to scsi_device, and replace the
         * corresponding bit fields in scsi_device, so bflags
         * need not be passed as an argument.
         */
        *bflags = scsi_get_device_flags(sdev, &inq_result[8],
                &inq_result[16]);

        /* When the first pass succeeds we gain information about
         * what larger transfer lengths might work. */
        if (pass == 1) {
            if (BLIST_INQUIRY_36 & *bflags)
                next_inquiry_len = 36;
            else if (BLIST_INQUIRY_58 & *bflags)
                next_inquiry_len = 58;
            else if (sdev->inquiry_len)
                next_inquiry_len = sdev->inquiry_len;
            else
                next_inquiry_len = response_len;

            /* If more data is available perform the second pass */
            if (next_inquiry_len > try_inquiry_len) {
                try_inquiry_len = next_inquiry_len;
                pass = 2;
                goto next_pass;
            }
        }

    } else if (pass == 2) {
        sdev_printk(KERN_INFO, sdev,
                "scsi scan: %d byte inquiry failed.  "
                "Consider BLIST_INQUIRY_36 for this device\n",
                try_inquiry_len);

        /* If this pass failed, the third pass goes back and transfers
         * the same amount as we successfully got in the first pass. */
        try_inquiry_len = first_inquiry_len;
        pass = 3;
        goto next_pass;
    }

    /* If the last transfer attempt got an error, assume the
     * peripheral doesn't exist or is dead. */
    if (result)
        return -EIO;

    /* Don't report any more data than the device says is valid */
    sdev->inquiry_len = min(try_inquiry_len, response_len);

    /*
     * XXX Abort if the response length is less than 36? If less than
     * 32, the lookup of the device flags (above) could be invalid,
     * and it would be possible to take an incorrect action - we do
     * not want to hang because of a short INQUIRY. On the flip side,
     * if the device is spun down or becoming ready (and so it gives a
     * short INQUIRY), an abort here prevents any further use of the
     * device, including spin up.
     *
     * On the whole, the best approach seems to be to assume the first
     * 36 bytes are valid no matter what the device says.  That's
     * better than copying < 36 bytes to the inquiry-result buffer
     * and displaying garbage for the Vendor, Product, or Revision
     * strings.
     */
    if (sdev->inquiry_len < 36) {
        /* Log the short-INQUIRY notice only once per host. */
        if (!sdev->host->short_inquiry) {
            shost_printk(KERN_INFO, sdev->host,
                    "scsi scan: INQUIRY result too short (%d),"
                    " using 36\n", sdev->inquiry_len);
            sdev->host->short_inquiry = 1;
        }
        sdev->inquiry_len = 36;
    }

    /*
     * Related to the above issue:
     *
     * XXX Devices (disk or all?) should be sent a TEST UNIT READY,
     * and if not ready, sent a START_STOP to start (maybe spin up) and
     * then send the INQUIRY again, since the INQUIRY can change after
     * a device is initialized.
     *
     * Ideally, start a device if explicitly asked to do so.  This
     * assumes that a device is spun up on power on, spun down on
     * request, and then spun up on request.
     */

    /*
     * The scanning code needs to know the scsi_level, even if no
     * device is attached at LUN 0 (SCSI_SCAN_TARGET_PRESENT) so
     * non-zero LUNs can be scanned.
     */
    sdev->scsi_level = inq_result[2] & 0x07;
    /*
     * A raw level of 2 or more, or a raw level of 1 whose response
     * format nibble (byte 3) equals 1, is stored incremented by one.
     */
    if (sdev->scsi_level >= 2 ||
        (sdev->scsi_level == 1 && (inq_result[3] & 0x0f) == 1))
        sdev->scsi_level++;
    sdev->sdev_target->scsi_level = sdev->scsi_level;

    /*
     * If SCSI-2 or lower, and if the transport requires it,
     * store the LUN value in CDB[1].
     */
    sdev->lun_in_cdb = 0;
    if (sdev->scsi_level <= SCSI_2 &&
        sdev->scsi_level != SCSI_UNKNOWN &&
        !sdev->host->no_scsi2_lun_in_cdb)
        sdev->lun_in_cdb = 1;

    return 0;
}


static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
        int *bflags, int async)
{
    int ret;

    /*
     * XXX do not save the inquiry, since it can change underneath us,
     * save just vendor/model/rev.
     *
     * Rather than save it and have an ioctl that retrieves the saved
     * value, have an ioctl that executes the same INQUIRY code used
     * in scsi_probe_lun, let user level programs doing INQUIRY
     * scanning run at their own risk, or supply a user level program
     * that can correctly scan.
     */

    /*
     * Copy at least 36 bytes of INQUIRY data, so that we don't
     * dereference unallocated memory when accessing the Vendor,
     * Product, and Revision strings.  Badly behaved devices may set
     * the INQUIRY Additional Length byte to a small value, indicating
     * these strings are invalid, but often they contain plausible data
     * nonetheless.  It doesn't matter if the device sent < 36 bytes
     * total, since scsi_probe_lun() initializes inq_result with 0s.
     */
    sdev->inquiry = kmemdup(inq_result,
                max_t(size_t, sdev->inquiry_len, 36),
                GFP_ATOMIC);
    if (sdev->inquiry == NULL)
        return SCSI_SCAN_NO_RESPONSE;

    sdev->vendor = (char *) (sdev->inquiry + 8); //第8個字節(jié)到第15個字節(jié)是vendor identification
    sdev->model = (char *) (sdev->inquiry + 16); //第16個字節(jié)到第31個字節(jié)是product identification
    sdev->rev = (char *) (sdev->inquiry + 32);   //第32個字節(jié)到第35個字節(jié)是product revision level

    if (strncmp(sdev->vendor, "ATA     ", 8) == 0) {
        /*
         * sata emulation layer device.  This is a hack to work around
         * the SATL power management specifications which state that
         * when the SATL detects the device has gone into standby
         * mode, it shall respond with NOT READY.
         */
        sdev->allow_restart = 1;
    }

    if (*bflags & BLIST_ISROM) {
        sdev->type = TYPE_ROM;
        sdev->removable = 1;
    } else {
        sdev->type = (inq_result[0] & 0x1f);
        sdev->removable = (inq_result[1] & 0x80) >> 7;

        /*
         * some devices may respond with wrong type for
         * well-known logical units. Force well-known type
         * to enumerate them correctly.
         */
        if (scsi_is_wlun(sdev->lun) && sdev->type != TYPE_WLUN) {
            sdev_printk(KERN_WARNING, sdev,
                "%s: correcting incorrect peripheral device type 0x%x for W-LUN 0x%16xhN\n",
                __func__, sdev->type, (unsigned int)sdev->lun);
            sdev->type = TYPE_WLUN;
        }

    }

    if (sdev->type == TYPE_RBC || sdev->type == TYPE_ROM) {
        /* RBC and MMC devices can return SCSI-3 compliance and yet
         * still not support REPORT LUNS, so make them act as
         * BLIST_NOREPORTLUN unless BLIST_REPORTLUN2 is
         * specifically set */
        if ((*bflags & BLIST_REPORTLUN2) == 0)
            *bflags |= BLIST_NOREPORTLUN;
    }

    /*
     * For a peripheral qualifier (PQ) value of 1 (001b), the SCSI
     * spec says: The device server is capable of supporting the
     * specified peripheral device type on this logical unit. However,
     * the physical device is not currently connected to this logical
     * unit.
     *
     * The above is vague, as it implies that we could treat 001 and
     * 011 the same. Stay compatible with previous code, and create a
     * scsi_device for a PQ of 1
     *
     * Don't set the device offline here; rather let the upper
     * level drivers eval the PQ to decide whether they should
     * attach. So remove ((inq_result[0] >> 5) & 7) == 1 check.
     */

    sdev->inq_periph_qual = (inq_result[0] >> 5) & 7;
    sdev->lockable = sdev->removable;
    sdev->soft_reset = (inq_result[7] & 1) && ((inq_result[3] & 7) == 2);

    if (sdev->scsi_level >= SCSI_3 ||
            (sdev->inquiry_len > 56 && inq_result[56] & 0x04))
        sdev->ppr = 1;
    if (inq_result[7] & 0x60)
        sdev->wdtr = 1;
    if (inq_result[7] & 0x10)
        sdev->sdtr = 1;

    sdev_printk(KERN_NOTICE, sdev, "%s %.8s %.16s %.4s PQ: %d "
            "ANSI: %d%s\n", scsi_device_type(sdev->type),
            sdev->vendor, sdev->model, sdev->rev,
            sdev->inq_periph_qual, inq_result[2] & 0x07,
            (inq_result[3] & 0x0f) == 1 ? " CCS" : "");

    if ((sdev->scsi_level >= SCSI_2) && (inq_result[7] & 2) &&
        !(*bflags & BLIST_NOTQ)) {
        sdev->tagged_supported = 1;
        sdev->simple_tags = 1;
    }

    /*
     * Some devices (Texel CD ROM drives) have handshaking problems
     * when used with the Seagate controllers. borken is initialized
     * to 1, and then set it to 0 here.
     */
    if ((*bflags & BLIST_BORKEN) == 0)
        sdev->borken = 0;

    if (*bflags & BLIST_NO_ULD_ATTACH)
        sdev->no_uld_attach = 1;

    /*
     * Apparently some really broken devices (contrary to the SCSI
     * standards) need to be selected without asserting ATN
     */
    if (*bflags & BLIST_SELECT_NO_ATN)
        sdev->select_no_atn = 1;

    /*
     * Maximum 512 sector transfer length
     * broken RA4x00 Compaq Disk Array
     */
    if (*bflags & BLIST_MAX_512)
        blk_queue_max_hw_sectors(sdev->request_queue, 512);
    /*
     * Max 1024 sector transfer length for targets that report incorrect
     * max/optimal lengths and relied on the old block layer safe default
     */
    else if (*bflags & BLIST_MAX_1024)
        blk_queue_max_hw_sectors(sdev->request_queue, 1024);

    /*
     * Some devices may not want to have a start command automatically
     * issued when a device is added.
     */
    if (*bflags & BLIST_NOSTARTONADD)
        sdev->no_start_on_add = 1;

    if (*bflags & BLIST_SINGLELUN)
        scsi_target(sdev)->single_lun = 1;

    sdev->use_10_for_rw = 1;

    if (*bflags & BLIST_MS_SKIP_PAGE_08)
        sdev->skip_ms_page_8 = 1;

    if (*bflags & BLIST_MS_SKIP_PAGE_3F)
        sdev->skip_ms_page_3f = 1;

    if (*bflags & BLIST_USE_10_BYTE_MS)
        sdev->use_10_for_ms = 1;

    /* some devices don't like REPORT SUPPORTED OPERATION CODES
     * and will simply timeout causing sd_mod init to take a very
     * very long time */
    if (*bflags & BLIST_NO_RSOC)
        sdev->no_report_opcodes = 1;

    /* set the device running here so that slave configure
     * may do I/O */
    ret = scsi_device_set_state(sdev, SDEV_RUNNING); //狀態(tài)
    if (ret) {
        ret = scsi_device_set_state(sdev, SDEV_BLOCK);

        if (ret) {
            sdev_printk(KERN_ERR, sdev,
                    "in wrong state %s to complete scan\n",
                    scsi_device_state_name(sdev->sdev_state));
            return SCSI_SCAN_NO_RESPONSE;
        }
    }

    if (*bflags & BLIST_MS_192_BYTES_FOR_3F)
        sdev->use_192_bytes_for_3f = 1;

    if (*bflags & BLIST_NOT_LOCKABLE)
        sdev->lockable = 0;

    if (*bflags & BLIST_RETRY_HWERROR)
        sdev->retry_hwerror = 1;

    if (*bflags & BLIST_NO_DIF)
        sdev->no_dif = 1;

    sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT;

    if (*bflags & BLIST_TRY_VPD_PAGES)
        sdev->try_vpd_pages = 1;
    else if (*bflags & BLIST_SKIP_VPD_PAGES)
        sdev->skip_vpd_pages = 1;

    transport_configure_device(&sdev->sdev_gendev); //把lun配置到scsi傳輸層

    if (sdev->host->hostt->slave_configure) {
        ret = sdev->host->hostt->slave_configure(sdev); //主機適配器模板設(shè)置的回調(diào),對scsi_device(lun)執(zhí)行特定的初始化
        if (ret) {
            /*
             * if LLDD reports slave not present, don't clutter
             * console with alloc failure messages
             */
            if (ret != -ENXIO) {
                sdev_printk(KERN_ERR, sdev,
                    "failed to configure device\n");
            }
            return SCSI_SCAN_NO_RESPONSE;
        }
    }

    if (sdev->scsi_level >= SCSI_3)
        scsi_attach_vpd(sdev);

    sdev->max_queue_depth = sdev->queue_depth;  //設(shè)置最大隊列深度

    /*
     * Ok, the device is now all set up, we can
     * register it and tell the rest of the kernel
     * about it.
     */ //添加scsi_device(lun)到sysfs
    if (!async && scsi_sysfs_add_sdev(sdev) != 0)
        return SCSI_SCAN_NO_RESPONSE;

    return SCSI_SCAN_LUN_PRESENT;
}

    本站是提供個人知識管理的網絡存儲空間,所有內容均由用戶發布,不代表本站觀點。請注意甄別內容中的聯系方式、誘導購買等信息,謹防詐騙。如發現有害或侵權內容,請點擊一鍵舉報。
    轉(zhuǎn)藏 分享 獻花(0

    0條評論

    發(fā)表

    請遵守用戶 評論公約

    類似文章 更多