|
本系列文章將自底向上分析Linux存儲IO棧源碼(基于4.4.19),為學習Linux存儲做記錄。具體目錄如下:
一、 Linux內核對象與對象集
二、 sysfs
三、 設備模型
四、 SCSI子系統

Linux內核對象和對象集

內核對象作為Linux設備驅動模型的基礎,主要是抽象和封裝總線、設備、驅動、類和接口之間的關系及其具體實現的相關代碼,并在sysfs中呈現。主要抽象成kobject和kset結構:

struct kobject {
const char *name; //在sysfs中顯示的名稱
struct list_head entry; //鏈入kset的kobj鏈表
struct kobject *parent; //指向父kobject,用于表示樹形結(jié)構(gòu)
struct kset *kset; //指向鏈入的kset
struct kobj_type *ktype; //抽象kobject的通用方法和屬性
struct kernfs_node *sd; //sysfs directory entry
struct kref kref; //引用計數(shù)
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
struct delayed_work release;
#endif
unsigned int state_initialized:1; //是否被初始化
unsigned int state_in_sysfs:1; //是否被添加到sysfs
unsigned int state_add_uevent_sent:1; //是否發(fā)送ADD事件到用戶空間
unsigned int state_remove_uevent_sent:1; //是否發(fā)送REMOVE事件到用戶空間
unsigned int uevent_suppress:1; //事件是否被抑制
};

在kobject結構中,ktype域是對kobject一些通用方法和屬性的封裝:

struct kobj_type {
void (*release)(struct kobject *kobj); //釋放kobject結(jié)構(gòu)時回調(diào)
const struct sysfs_ops *sysfs_ops; //sysfs的操作函數(shù)
struct attribute **default_attrs; //默認屬性
//命名空間相關(guān)操作
const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
const void *(*namespace)(struct kobject *kobj);
};

kset是一組kobject的集合,通過kset可以遍歷這組kobject。例如在SCSI子系統中,設備是一種kobject,通過設備集kset,可以遍歷所有的設備。

/**
* struct kset - a set of kobjects of a specific type, belonging to a specific subsystem.
*
* A kset defines a group of kobjects. They can be individually
* different "types" but overall these kobjects all want to be grouped
* together and operated on in the same manner. ksets are used to
* define the attribute callbacks and other common events that happen to
* a kobject.
*
* @list: the list of all kobjects for this kset
* @list_lock: a lock for iterating over the kobjects
* @kobj: the embedded kobject for this kset (recursion, isn't it fun...)
* @uevent_ops: the set of uevent operations for this kset. These are
* called whenever a kobject has something happen to it so that the kset
* can add new environment variables, or filter out the uevents if so
* desired.
*/
struct kset {
struct list_head list; //鏈入kset的kobject鏈表
spinlock_t list_lock; //遍歷鏈表時的自旋鎖
struct kobject kobj; //本身可以當做kobject對待
const struct kset_uevent_ops *uevent_ops; //發(fā)送uevent事件的回調(diào)函數(shù)
};

在發送事件到用戶空間時,可以回調kset_uevent_ops中的3個回調函數:

struct kset_uevent_ops {
int (* const filter)(struct kset *kset, struct kobject *kobj);
const char *(* const name)(struct kset *kset, struct kobject *kobj);
int (* const uevent)(struct kset *kset, struct kobject *kobj,
struct kobj_uevent_env *env);
};
內核對象相關操作

void kobject_init(struct kobject *kobj, struct kobj_type *ktype);
int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...);
int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...);
void kobject_del(struct kobject *kobj);
struct kobject * kobject_create(void);
struct kobject * kobject_create_and_add(const char *name, struct kobject *parent);
int kobject_rename(struct kobject *, const char *new_name);
int kobject_move(struct kobject *, struct kobject *);
struct kobject *kobject_get(struct kobject *kobj);
void kobject_put(struct kobject *kobj);
const void *kobject_namespace(struct kobject *kobj);
char *kobject_get_path(struct kobject *kobj, gfp_t flag);

內核對象創建及初始化

初始化流程主要在kobject_init:

/**
* kobject_init - initialize a kobject structure
* @kobj: pointer to the kobject to initialize
* @ktype: pointer to the ktype for this kobject.
*
* This function will properly initialize a kobject such that it can then
* be passed to the kobject_add() call.
*
* After this function is called, the kobject MUST be cleaned up by a call
* to kobject_put(), not by a call to kfree directly to ensure that all of
* the memory is cleaned up properly.
*/
void kobject_init(struct kobject *kobj, struct kobj_type *ktype)
{
	/*
	 * Message text only: the "kobject (%p): %s\n" format at the error
	 * label supplies the trailing newline, so none is embedded in the
	 * strings below (an embedded one would print an empty log line).
	 */
	const char *err_str;

	if (!kobj) {
		err_str = "invalid kobject pointer!";
		goto error;
	}
	if (!ktype) {
		err_str = "must have a ktype to be initialized properly!";
		goto error;
	}
	if (kobj->state_initialized) {
		/*
		 * Guard against double initialisation: complain loudly but
		 * carry on.
		 */
		/* do not error out as sometimes we can recover */
		printk(KERN_ERR "kobject (%p): tried to init an initialized "
		       "object, something is seriously wrong.\n", kobj);
		dump_stack();
	}

	/* The helper does the actual field setup; the ktype is bound after. */
	kobject_init_internal(kobj);
	kobj->ktype = ktype;
	return;

error:
	/* Invalid arguments: log the reason with a stack trace; no return value. */
	printk(KERN_ERR "kobject (%p): %s\n", kobj, err_str);
	dump_stack();
}
EXPORT_SYMBOL(kobject_init);

由上面的函數可以看出,初始化由kobject_init_internal完成:

static void kobject_init_internal(struct kobject *kobj)
{
if (!kobj)
return;
kref_init(&kobj->kref);
INIT_LIST_HEAD(&kobj->entry);
kobj->state_in_sysfs = 0;
kobj->state_add_uevent_sent = 0;
kobj->state_remove_uevent_sent = 0;
kobj->state_initialized = 1;
}

kobject_create函數僅僅是在調用kobject_init之前,先分配kobject空間。在kobject初始化之后,需要調用kobject_add將kobject添加到sysfs中。

/**
* kobject_add - the main kobject add function
* @kobj: the kobject to add
* @parent: pointer to the parent of the kobject.
* @fmt: format to name the kobject with.
*
* The kobject name is set and added to the kobject hierarchy in this
* function.
*
* If @parent is set, then the parent of the @kobj will be set to it.
* If @parent is NULL, then the parent of the @kobj will be set to the
* kobject associated with the kset assigned to this kobject. If no kset
* is assigned to the kobject, then the kobject will be located in the
* root of the sysfs tree.
*
* If this function returns an error, kobject_put() must be called to
* properly clean up the memory associated with the object.
* Under no instance should the kobject that is passed to this function
* be directly freed with a call to kfree(), that can leak memory.
*
* Note, no "add" uevent will be created with this call, the caller should set
* up all of the necessary sysfs files for the object and then call
* kobject_uevent() with the UEVENT_ADD parameter to ensure that
* userspace is properly notified of this kobject's creation.
*/
int kobject_add(struct kobject *kobj, struct kobject *parent,
		const char *fmt, ...)
{
	va_list ap;
	int ret;

	if (!kobj)
		return -EINVAL;

	/* Normal path: the object has been initialised, so name it and add it. */
	if (kobj->state_initialized) {
		va_start(ap, fmt);
		/* kobject_add_varg() sets the name and performs the add. */
		ret = kobject_add_varg(kobj, parent, fmt, ap);
		va_end(ap);
		return ret;
	}

	/* Adding an object that was never initialised is a caller bug. */
	printk(KERN_ERR "kobject '%s' (%p): tried to add an "
	       "uninitialized object, something is seriously wrong.\n",
	       kobject_name(kobj), kobj);
	dump_stack();
	return -EINVAL;
}
EXPORT_SYMBOL(kobject_add);

kobject_add_varg/kobject_add_internal主要完成將kobject添加到sysfs的操作:

static __printf(3, 0) int kobject_add_varg(struct kobject *kobj,
struct kobject *parent,
const char *fmt, va_list vargs)
{
int retval;
//設(shè)置kobject在sysfs中顯示的名稱
retval = kobject_set_name_vargs(kobj, fmt, vargs);
if (retval) {
printk(KERN_ERR "kobject: can not set name properly!\n");
return retval;
}
kobj->parent = parent;
return kobject_add_internal(kobj); //主要實現(xiàn)函數(shù)
}
static int kobject_add_internal(struct kobject *kobj)
{
int error = 0;
struct kobject *parent;
if (!kobj)
return -ENOENT;
if (!kobj->name || !kobj->name[0]) {
WARN(1, "kobject: (%p): attempted to be registered with empty "
"name!\n", kobj);
return -EINVAL;
}
parent = kobject_get(kobj->parent); //增加父對象的引用計數(shù)
/* join kset if set, use it as parent if we do not already have one */
if (kobj->kset) { //如果設(shè)置了kset,而沒有設(shè)置parent,則把kset的kobject設(shè)置為parent
if (!parent)
parent = kobject_get(&kobj->kset->kobj);
kobj_kset_join(kobj);
kobj->parent = parent;
}
pr_debug("kobject: '%s' (%p): %s: parent: '%s', set: '%s'\n",
kobject_name(kobj), kobj, __func__,
parent ? kobject_name(parent) : "<NULL>",
kobj->kset ? kobject_name(&kobj->kset->kobj) : "<NULL>");
error = create_dir(kobj); //創(chuàng)建sysfs對應的目錄和屬性文件
if (error) { //出錯回滾
kobj_kset_leave(kobj);
kobject_put(parent);
kobj->parent = NULL;
/* be noisy on error issues */
if (error == -EEXIST)
WARN(1, "%s failed for %s with "
"-EEXIST, don't try to register things with "
"the same name in the same directory.\n",
__func__, kobject_name(kobj));
else
WARN(1, "%s failed for %s (error: %d parent: %s)\n",
__func__, kobject_name(kobj), error,
parent ? kobject_name(parent) : "'none'");
} else
kobj->state_in_sysfs = 1; //更新標志位
return error;
}

由create_dir在sysfs中創建真實的目錄和文件,這點由下一篇sysfs詳細描述。理解了kobject_init和kobject_add之后,由名字就可以知道kobject_init_and_add和kobject_create_and_add這兩個函數的作用。

內核對象釋放

調用kobject_del將kobject釋放:

/**
* kobject_del - unlink kobject from hierarchy.
* @kobj: object.
*/
void kobject_del(struct kobject *kobj)
{
	struct kernfs_node *dir_node;

	if (kobj == NULL)
		return;

	/*
	 * Keep our own copy of the sysfs node pointer before the directory
	 * is removed, then drop the reference on it afterwards.
	 * NOTE(review): presumably sysfs_remove_dir() clears kobj->sd, which
	 * is why the pointer is saved first; confirm against sysfs.
	 */
	dir_node = kobj->sd;
	sysfs_remove_dir(kobj);		/* remove the object's sysfs directory */
	sysfs_put(dir_node);

	kobj->state_in_sysfs = 0;	/* no longer represented in sysfs */
	kobj_kset_leave(kobj);		/* unlink from the owning kset's list */

	/* Drop the reference held on the parent, then forget the parent. */
	kobject_put(kobj->parent);
	kobj->parent = NULL;
}
EXPORT_SYMBOL(kobject_del);
/**
* kobject_put - decrement refcount for object.
* @kobj: object.
*
* Decrement the refcount, and if 0, call kobject_cleanup().
*/
void kobject_put(struct kobject *kobj)
{
	/* A NULL object is silently ignored. */
	if (!kobj)
		return;

	/* Warn about, but still process, a put on an uninitialised object. */
	if (!kobj->state_initialized)
		WARN(1, KERN_WARNING "kobject: '%s' (%p): is not "
		     "initialized, yet kobject_put() is being "
		     "called.\n", kobject_name(kobj), kobj);

	/* kobject_release() is passed as the kref release callback. */
	kref_put(&kobj->kref, kobject_release);
}
EXPORT_SYMBOL(kobject_put);
/*
 * kref_put - drop a single reference on @kref.
 * @kref: reference counter to decrement.
 * @release: callback passed through unchanged to kref_sub().
 *
 * Thin wrapper: forwards to kref_sub() with a count of 1 and returns
 * whatever kref_sub() returns.
 */
static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
{
return kref_sub(kref, 1, release);
}
static inline int kref_sub(struct kref *kref, unsigned int count,
void (*release)(struct kref *kref))
{
WARN_ON(release == NULL);
if (atomic_sub_and_test((int) count, &kref->refcount)) {
release(kref); //調(diào)用kobject_release
return 1;
}
return 0;
}

根據上面的代碼追蹤,得知kobject_release才是釋放kobject的主角:

static void kobject_release(struct kref *kref)
{
struct kobject *kobj = container_of(kref, struct kobject, kref);
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
unsigned long delay = HZ + HZ * (get_random_int() & 0x3);
pr_info("kobject: '%s' (%p): %s, parent %p (delayed %ld)\n",
kobject_name(kobj), kobj, __func__, kobj->parent, delay);
INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup);
//延遲調(diào)用kobject_delayed_cleanup進行清理
schedule_delayed_work(&kobj->release, delay);
#else
kobject_cleanup(kobj); //清理
#endif
}

如果在內核編譯時指定CONFIG_DEBUG_KOBJECT_RELEASE,則使用延遲release方式調用kobject_delayed_cleanup,否則直接調用kobject_cleanup。

#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
/*
 * kobject_delayed_cleanup - delayed-work handler for kobject release.
 * @work: the work item embedded in the kobject's 'release' delayed_work.
 *
 * Recovers the owning kobject from @work and hands it to kobject_cleanup(),
 * so the delayed path ends in the same cleanup routine as the direct one.
 */
static void kobject_delayed_cleanup(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct kobject *kobj = container_of(dwork, struct kobject, release);

	kobject_cleanup(kobj);
}
#endif
/*
* kobject_cleanup - free kobject resources.
* @kobj: object to cleanup
*/
static void kobject_cleanup(struct kobject *kobj)
{
struct kobj_type *t = get_ktype(kobj);
const char *name = kobj->name;
pr_debug("kobject: '%s' (%p): %s, parent %p\n",
kobject_name(kobj), kobj, __func__, kobj->parent);
if (t && !t->release)
pr_debug("kobject: '%s' (%p): does not have a release() "
"function, it is broken and must be fixed.\n",
kobject_name(kobj), kobj);
/* send "remove" if the caller did not do it but sent "add" */
if (kobj->state_add_uevent_sent && !kobj->state_remove_uevent_sent) {
pr_debug("kobject: '%s' (%p): auto cleanup 'remove' event\n",
kobject_name(kobj), kobj);
kobject_uevent(kobj, KOBJ_REMOVE); //僅僅發(fā)送一次REMOVE消息
}
/* remove from sysfs if the caller did not do it */
if (kobj->state_in_sysfs) {
pr_debug("kobject: '%s' (%p): auto cleanup kobject_del\n",
kobject_name(kobj), kobj);
kobject_del(kobj); //如果調(diào)用者沒有清理sysfs,則清理
}
if (t && t->release) {
pr_debug("kobject: '%s' (%p): calling ktype release\n",
kobject_name(kobj), kobj);
t->release(kobj); //調(diào)用kobj_type的release回調(diào)函數(shù)
}
/* free name if we allocated it */
if (name) {
pr_debug("kobject: '%s': free name\n", name);
kfree_const(name);
}
}

內核對象集相關操作

void kset_init(struct kset *kset);
struct kset *kset_create(const char *name, const struct kset_uevent_ops *uevent_ops, struct kobject *parent_kobj);
int kset_register(struct kset *kset);
void kset_unregister(struct kset *kset);
struct kset * kset_create_and_add(const char *name, const struct kset_uevent_ops *u, struct kobject *parent_kobj);

內核對象集創建及初始化

內核對象集由kset_create創建:

/**
* kset_create - create a struct kset dynamically
*
* @name: the name for the kset
* @uevent_ops: a struct kset_uevent_ops for the kset
* @parent_kobj: the parent kobject of this kset, if any.
*
* This function creates a kset structure dynamically. This structure can
* then be registered with the system and show up in sysfs with a call to
* kset_register(). When you are finished with this structure, if
* kset_register() has been called, call kset_unregister() and the
* structure will be dynamically freed when it is no longer being used.
*
* If the kset was not able to be created, NULL will be returned.
*/
static struct kset *kset_create(const char *name,
const struct kset_uevent_ops *uevent_ops,
struct kobject *parent_kobj)
{
struct kset *kset;
int retval;
kset = kzalloc(sizeof(*kset), GFP_KERNEL); //分配空間
if (!kset)
return NULL;
retval = kobject_set_name(&kset->kobj, "%s", name); //設(shè)置kset在sysfs中的名字
if (retval) {
kfree(kset);
return NULL;
}
kset->uevent_ops = uevent_ops; //設(shè)置uevent_ops
kset->kobj.parent = parent_kobj; //設(shè)置kset的父對象
/*
* The kobject of this kset will have a type of kset_ktype and belong to
* no kset itself. That way we can properly free it when it is
* finished being used.
*/
kset->kobj.ktype = &kset_ktype; //設(shè)置kobj_type
kset->kobj.kset = NULL;
return kset;
}

內核對象集由kset_init執行初始化:

/**
* kset_init - initialize a kset for use
* @k: kset
*/
void kset_init(struct kset *k)
{
/* Initialise the kobject embedded in the kset via the shared helper. */
kobject_init_internal(&k->kobj); // the embedded kobject is initialised here
/* Start with an empty member list and an initialised list lock. */
INIT_LIST_HEAD(&k->list);
spin_lock_init(&k->list_lock);
}
static void kobject_init_internal(struct kobject *kobj)
{
if (!kobj)
return;
kref_init(&kobj->kref);
INIT_LIST_HEAD(&kobj->entry);
kobj->state_in_sysfs = 0; //設(shè)置對應標志位
kobj->state_add_uevent_sent = 0;
kobj->state_remove_uevent_sent = 0;
kobj->state_initialized = 1;
}

初始化kset之后,調用kset_register,將kset添加到sysfs:

/**
* kset_register - initialize and add a kset.
* @k: kset.
*/
int kset_register(struct kset *k)
{
int err;
if (!k)
return -EINVAL;
kset_init(k);
err = kobject_add_internal(&k->kobj); //完成register動作,前面已說明
if (err)
return err;
kobject_uevent(&k->kobj, KOBJ_ADD); //發(fā)送ADD事件到用戶空間
return 0;
}
EXPORT_SYMBOL(kset_register);

經過kset_create、kset_init和kset_register之后,kset已初始化并添加完成。當然,kset_create_and_add包含了這三個函數。

內核對象集釋放

內核對象集的釋放過程與kobject的釋放過程類似,由kset_unregister完成:

/**
* kset_unregister - remove a kset.
* @k: kset.
*/
void kset_unregister(struct kset *k)
{
/* Unregistering a NULL kset is a no-op. */
if (!k)
return;
kobject_del(&k->kobj); // remove the embedded kobject's sysfs directory (see kobject_del)
kobject_put(&k->kobj); // drop the reference; same release path as a plain kobject
}
EXPORT_SYMBOL(kset_unregister);

發送事件到用戶空間

由前面的代碼可以看到,無論kobject或是kset,都會向用戶空間發送事件,由kobject_uevent函數通過設置環境變量的方式完成:

struct kobj_uevent_env {
char *argv[3]; //user_helper使用的命令
char *envp[UEVENT_NUM_ENVP]; //環(huán)境變量數(shù)組
int envp_idx; //當前環(huán)境變量索引
char buf[UEVENT_BUFFER_SIZE]; //環(huán)境變量數(shù)據(jù)緩沖區(qū)
int buflen;
};
/**
* kobject_uevent - notify userspace by sending an uevent
*
* @action: action that is happening
* @kobj: struct kobject that the action is happening to
*
* Returns 0 if kobject_uevent() is completed with success or the
* corresponding error when it fails.
*/
int kobject_uevent(struct kobject *kobj, enum kobject_action action)
{
/* No extra environment strings: pass NULL for envp_ext. */
return kobject_uevent_env(kobj, action, NULL); // this call does the actual sending
}
EXPORT_SYMBOL_GPL(kobject_uevent);
/**
* kobject_uevent_env - send an uevent with environmental data
*
* @action: action that is happening
* @kobj: struct kobject that the action is happening to
* @envp_ext: pointer to environmental data
*
* Returns 0 if kobject_uevent_env() is completed with success or the
* corresponding error when it fails.
*/
int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
char *envp_ext[])
{
struct kobj_uevent_env *env;
const char *action_string = kobject_actions[action];
const char *devpath = NULL;
const char *subsystem;
struct kobject *top_kobj;
struct kset *kset;
const struct kset_uevent_ops *uevent_ops;
int i = 0;
int retval = 0;
#ifdef CONFIG_NET
struct uevent_sock *ue_sk;
#endif
pr_debug("kobject: '%s' (%p): %s\n",
kobject_name(kobj), kobj, __func__);
/* search the kset we belong to */
top_kobj = kobj;
while (!top_kobj->kset && top_kobj->parent) // walk up the parents to the nearest object that has a kset; the kset carries the uevent_ops
top_kobj = top_kobj->parent;
if (!top_kobj->kset) {
pr_debug("kobject: '%s' (%p): %s: attempted to send uevent "
"without kset!\n", kobject_name(kobj), kobj,
__func__);
return -EINVAL;
}
kset = top_kobj->kset;
uevent_ops = kset->uevent_ops; // the callbacks of that kset drive the rest of the send
/* skip the event, if uevent_suppress is set*/
if (kobj->uevent_suppress) { // objects flagged uevent_suppress send nothing
pr_debug("kobject: '%s' (%p): %s: uevent_suppress "
"caused the event to drop!\n",
kobject_name(kobj), kobj, __func__);
return 0;
}
/* skip the event, if the filter returns zero. */
if (uevent_ops && uevent_ops->filter) // give the kset's filter callback a chance to drop the event
if (!uevent_ops->filter(kset, kobj)) {
pr_debug("kobject: '%s' (%p): %s: filter function "
"caused the event to drop!\n",
kobject_name(kobj), kobj, __func__);
return 0;
}
/* originating subsystem */
if (uevent_ops && uevent_ops->name) // subsystem name: from the name callback if present, else the kset's own kobject name
subsystem = uevent_ops->name(kset, kobj);
else
subsystem = kobject_name(&kset->kobj);
if (!subsystem) {
pr_debug("kobject: '%s' (%p): %s: unset subsystem caused the "
"event to drop!\n", kobject_name(kobj), kobj,
__func__);
return 0;
}
/* environment buffer */
env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); // allocate a zeroed kobj_uevent_env
if (!env)
return -ENOMEM;
/* complete object path */
devpath = kobject_get_path(kobj, GFP_KERNEL);
if (!devpath) {
retval = -ENOENT;
goto exit;
}
/* default keys: add the ACTION, DEVPATH and SUBSYSTEM variables */
retval = add_uevent_var(env, "ACTION=%s", action_string);
if (retval)
goto exit;
retval = add_uevent_var(env, "DEVPATH=%s", devpath);
if (retval)
goto exit;
retval = add_uevent_var(env, "SUBSYSTEM=%s", subsystem);
if (retval)
goto exit;
/* keys passed in from the caller */
if (envp_ext) {
for (i = 0; envp_ext[i]; i++) {
retval = add_uevent_var(env, "%s", envp_ext[i]);
if (retval)
goto exit;
}
}
/* let the kset specific function add its stuff */
if (uevent_ops && uevent_ops->uevent) { // the uevent callback may append subsystem-specific variables
retval = uevent_ops->uevent(kset, kobj, env);
if (retval) {
pr_debug("kobject: '%s' (%p): %s: uevent() returned "
"%d\n", kobject_name(kobj), kobj,
__func__, retval);
goto exit;
}
}
/*
* Mark "add" and "remove" events in the object to ensure proper
* events to userspace during automatic cleanup. If the object did
* send an "add" event, "remove" will automatically generated by
* the core, if not already done by the caller.
*/
if (action == KOBJ_ADD)
kobj->state_add_uevent_sent = 1;
else if (action == KOBJ_REMOVE)
kobj->state_remove_uevent_sent = 1;
/* uevent_sock_mutex is held across the sequence-number bump and the netlink loop */
mutex_lock(&uevent_sock_mutex);
/* we will send an event, so request a new sequence number */
retval = add_uevent_var(env, "SEQNUM=%llu", (unsigned long long)++uevent_seqnum);
if (retval) {
mutex_unlock(&uevent_sock_mutex);
goto exit;
}
#if defined(CONFIG_NET) // with CONFIG_NET set, the event is sent over netlink
/* send netlink message */
list_for_each_entry(ue_sk, &uevent_sock_list, list) {
struct sock *uevent_sock = ue_sk->sk;
struct sk_buff *skb;
size_t len;
if (!netlink_has_listeners(uevent_sock, 1))
continue;
/* allocate message with the maximum possible size */
len = strlen(action_string) + strlen(devpath) + 2;
skb = alloc_skb(len + env->buflen, GFP_KERNEL);
if (skb) {
char *scratch;
/* add header */
scratch = skb_put(skb, len);
sprintf(scratch, "%s@%s", action_string, devpath);
/* copy keys to our continuous event payload buffer */
for (i = 0; i < env->envp_idx; i++) {
len = strlen(env->envp[i]) + 1;
scratch = skb_put(skb, len);
strcpy(scratch, env->envp[i]);
}
NETLINK_CB(skb).dst_group = 1;
retval = netlink_broadcast_filtered(uevent_sock, skb, // multicast the message over netlink
0, 1, GFP_KERNEL,
kobj_bcast_filter,
kobj);
/* ENOBUFS should be handled in userspace */
if (retval == -ENOBUFS || retval == -ESRCH)
retval = 0;
} else
retval = -ENOMEM;
}
#endif
mutex_unlock(&uevent_sock_mutex);
#ifdef CONFIG_UEVENT_HELPER // with CONFIG_UEVENT_HELPER set, a usermode helper may also be run
/* call uevent_helper, usually only enabled during early boot */
if (uevent_helper[0] && !kobj_usermode_filter(kobj)) {
struct subprocess_info *info;
retval = add_uevent_var(env, "HOME=/");
if (retval)
goto exit;
retval = add_uevent_var(env,
"PATH=/sbin:/bin:/usr/sbin:/usr/bin");
if (retval)
goto exit;
retval = init_uevent_argv(env, subsystem); // presumably fills env->argv with the helper command and its arguments; confirm in init_uevent_argv
if (retval)
goto exit;
retval = -ENOMEM;
info = call_usermodehelper_setup(env->argv[0], env->argv, // set up the userspace program named by env->argv[0]
env->envp, GFP_KERNEL,
NULL, cleanup_uevent_env, env);
if (info) {
retval = call_usermodehelper_exec(info, UMH_NO_WAIT);
env = NULL; /* freed by cleanup_uevent_env */
}
}
#endif
exit:
/* common exit: env is NULL here if ownership was handed to the helper */
kfree(devpath);
kfree(env);
return retval;
}
EXPORT_SYMBOL_GPL(kobject_uevent_env);

sysfs與內核對象

本篇文章不是以文件系統的角度來詳細描述sysfs,而是從內核對象如何通過sysfs表示整個設備驅動模型為切入點,進一步理解Linux內核對象。

內核對象添加到sysfs

在上文《內核對象與對象集》中,將kobject添加到sysfs時,調用鏈為kobject_add –> kobject_add_varg –> kobject_add_internal,最終調用create_dir創建sysfs目錄和屬性文件。

static int create_dir(struct kobject *kobj)
{
const struct kobj_ns_type_operations *ops;
int error;
//調(diào)用sysfs接口創(chuàng)建kobject對應的目錄
error = sysfs_create_dir_ns(kobj, kobject_namespace(kobj));
if (error)
return error;
error = populate_dir(kobj); //在kobject對應的目錄中生成默認屬性文件
if (error) {
sysfs_remove_dir(kobj);
return error;
}
/*
* @kobj->sd may be deleted by an ancestor going away. Hold an
* extra reference so that it stays until @kobj is gone.
*/
sysfs_get(kobj->sd);
/*
* If @kobj has ns_ops, its children need to be filtered based on
* their namespace tags. Enable namespace support on @kobj->sd.
*/
ops = kobj_child_ns_ops(kobj);
if (ops) {
BUG_ON(ops->type <= KOBJ_NS_TYPE_NONE);
BUG_ON(ops->type >= KOBJ_NS_TYPES);
BUG_ON(!kobj_ns_type_registered(ops->type));
sysfs_enable_ns(kobj->sd);
}
return 0;
}
/*
* populate_dir - populate directory with attributes.
* @kobj: object we're working on.
*
* Most subsystems have a set of default attributes that are associated
* with an object that registers with them. This is a helper called during
* object registration that loops through the default attributes of the
* subsystem and creates attributes files for them in sysfs.
*/
static int populate_dir(struct kobject *kobj)
{
struct kobj_type *t = get_ktype(kobj);
struct attribute *attr;
int error = 0;
int i;
if (t && t->default_attrs) {
for (i = 0; (attr = t->default_attrs[i]) != NULL; i++) {
error = sysfs_create_file(kobj, attr); //為每個屬性創(chuàng)建對應的文件
if (error)
break;
}
}
return error;
}

create_dir通過調用sysfs_create_dir_ns創建sysfs中的目錄,調用sysfs_create_file創建屬性文件。

sysfs的核心結構kernfs_node代表sysfs中的每個節點。

/*
* kernfs_node - the building block of kernfs hierarchy. Each and every
* kernfs node is represented by single kernfs_node. Most fields are
* private to kernfs and shouldn't be accessed directly by kernfs users.
*
* As long as s_count reference is held, the kernfs_node itself is
* accessible. Dereferencing elem or any other outer entity requires
* active reference.
*/
struct kernfs_node {
atomic_t count; //引用計數(shù)
atomic_t active; //活動的引用計數(shù)
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
/*
* Use kernfs_get_parent() and kernfs_name/path() instead of
* accessing the following two fields directly. If the node is
* never moved to a different parent, it is safe to access the
* parent directly.
*/
struct kernfs_node *parent; //指向父節(jié)點
const char *name; //節(jié)點名稱,在sysfs顯示的名字
struct rb_node rb; //接入sysfs紅黑樹的鏈接項
const void *ns; /* namespace tag */
unsigned int hash; /* ns + name hash 紅黑樹key */
union {
struct kernfs_elem_dir dir; //該kern_node類型為目錄
struct kernfs_elem_symlink symlink; //該kern_node類型為鏈接
struct kernfs_elem_attr attr; //該kern_node類型為屬性文件
};
void *priv;
unsigned short flags; //標記位,目錄、鏈接、屬性文件或是否已被刪除
umode_t mode; //訪問權(quán)限,在sysfs中該kern_node的權(quán)限
unsigned int ino; //唯一編號
struct kernfs_iattrs *iattr; //用于設(shè)置非默認的inode屬性,如果沒有則置為NULL
};

在sysfs中創建目錄:sysfs_create_dir_ns

/**
* sysfs_create_dir_ns - create a directory for an object with a namespace tag
* @kobj: object we're creating directory for
* @ns: the namespace tag to use
*/
int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
{
	struct kernfs_node *parent_kn, *dir_kn;
	int err;

	BUG_ON(!kobj);

	/* Use the parent's sysfs node if there is a parent, else the sysfs root. */
	parent_kn = kobj->parent ? kobj->parent->sd : sysfs_root_kn;
	if (!parent_kn)
		return -ENOENT;

	/* Create the directory node; the kobject itself is stored as priv. */
	dir_kn = kernfs_create_dir_ns(parent_kn, kobject_name(kobj),
				      S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns);
	if (!IS_ERR(dir_kn)) {
		kobj->sd = dir_kn;
		return 0;
	}

	/* Failure: emit the duplicate-name warning for -EEXIST, then report. */
	err = PTR_ERR(dir_kn);
	if (err == -EEXIST)
		sysfs_warn_dup(parent_kn, kobject_name(kobj));
	return err;
}
/**
* kernfs_create_dir_ns - create a directory
* @parent: parent in which to create a new directory
* @name: name of the new directory
* @mode: mode of the new directory
* @priv: opaque data associated with the new directory
* @ns: optional namespace tag of the directory
*
* Returns the created node on success, ERR_PTR() value on failure.
*/
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
const char *name, umode_t mode,
void *priv, const void *ns)
{
struct kernfs_node *kn;
int rc;
/* allocate 分配空間并初始化, KERNFS_DIR指定創(chuàng)建目錄 */
kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);
if (!kn)
return ERR_PTR(-ENOMEM);
kn->dir.root = parent->dir.root; //指向根目錄kern_node
kn->ns = ns; //指定命名空間
kn->priv = priv;
/* link in */
rc = kernfs_add_one(kn); //將kern_node加入父目錄的紅黑樹中
if (!rc)
return kn;
kernfs_put(kn);
return ERR_PTR(rc);
}

kernfs_create_dir_ns函數中的兩個主要函數kernfs_new_node和kernfs_add_one,在創建文件和創建符號鏈接時同樣使用,僅是參數不同。

為kernfs_node結構分配空間,并初始化:

struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
const char *name, umode_t mode,
unsigned flags)
{
struct kernfs_node *kn;
//分配kern_node空間,并初始化
kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
if (kn) {
kernfs_get(parent);
kn->parent = parent;
}
return kn;
}
static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
const char *name, umode_t mode,
unsigned flags)
{
struct kernfs_node *kn;
int ret;
name = kstrdup_const(name, GFP_KERNEL); //復制常量字符串
if (!name)
return NULL;
kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); //在緩存空間分配kernfs_node
if (!kn)
goto err_out1;
/*
* If the ino of the sysfs entry created for a kmem cache gets
* allocated from an ida layer, which is accounted to the memcg that
* owns the cache, the memcg will get pinned forever. So do not account
* ino ida allocations.
*/
ret = ida_simple_get(&root->ino_ida, 1, 0, //獲取唯一標號,用于唯一標示kern_node
GFP_KERNEL | __GFP_NOACCOUNT);
if (ret < 0)
goto err_out2;
kn->ino = ret;
atomic_set(&kn->count, 1); //更新引用計數(shù)
atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
RB_CLEAR_NODE(&kn->rb);
//設(shè)置kern_node相關(guān)域
kn->name = name;
kn->mode = mode;
kn->flags = flags;
return kn;
err_out2:
kmem_cache_free(kernfs_node_cache, kn);
err_out1:
kfree_const(name);
return NULL;
}

將kernfs_node添加到parent的紅黑樹中:

/**
* kernfs_add_one - add kernfs_node to parent without warning
* @kn: kernfs_node to be added
*
* The caller must already have initialized @kn->parent. This
* function increments nlink of the parent's inode if @kn is a
* directory and link into the children list of the parent.
*
* RETURNS:
* 0 on success, -EEXIST if entry with the given name already
* exists.
*/
int kernfs_add_one(struct kernfs_node *kn)
{
struct kernfs_node *parent = kn->parent;
struct kernfs_iattrs *ps_iattr;
bool has_ns;
int ret;
mutex_lock(&kernfs_mutex);
ret = -EINVAL;
has_ns = kernfs_ns_enabled(parent);
if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
has_ns ? "required" : "invalid", parent->name, kn->name))
goto out_unlock;
if (kernfs_type(parent) != KERNFS_DIR) //檢查parent是否為目錄
goto out_unlock;
ret = -ENOENT;
if (parent->flags & KERNFS_EMPTY_DIR) //檢查parent是否為空目錄
goto out_unlock;
//檢查parent是否是active狀態(tài)
if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
goto out_unlock;
kn->hash = kernfs_name_hash(kn->name, kn->ns); //作為紅黑樹比較的key
ret = kernfs_link_sibling(kn); //kern_node鏈入parent節(jié)點紅黑樹中
if (ret)
goto out_unlock;
/* Update timestamps on the parent */
ps_iattr = parent->iattr;
if (ps_iattr) {
struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
}
mutex_unlock(&kernfs_mutex);
/*
* Activate the new node unless CREATE_DEACTIVATED is requested.
* If not activated here, the kernfs user is responsible for
* activating the node with kernfs_activate(). A node which hasn't
* been activated is not visible to userland and its removal won't
* trigger deactivation.
*/
if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
kernfs_activate(kn);
return 0;
out_unlock:
mutex_unlock(&kernfs_mutex);
return ret;
}

sysfs紅黑樹中的key:

/**
* kernfs_name_hash
* @name: Null terminated string to hash
* @ns: Namespace tag to hash
*
* Returns 31 bit hash of ns + name (so it fits in an off_t )
*/
static unsigned int kernfs_name_hash(const char *name, const void *ns)
{
	unsigned long hash = init_name_hash();
	const char *end = name + strlen(name);
	const char *p;

	/* Fold every character of the name (terminator excluded) into the hash. */
	for (p = name; p != end; p++)
		hash = partial_name_hash(*p, hash);

	/* Mix in the namespace tag, then keep only the low 31 bits. */
	hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
	hash &= 0x7fffffffU;

	/* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
	if (hash < 2)
		hash += 2;
	if (hash >= INT_MAX)
		hash = INT_MAX - 1;

	return hash;
}
/*
 * kernfs_name_compare - order a (hash, ns, name) key against node @kn.
 *
 * Keys are compared by hash first, then by namespace tag pointer, and
 * finally by name through strcmp().  Returns a negative value, zero or a
 * positive value when the key sorts before, equal to or after @kn.
 */
static int kernfs_name_compare(unsigned int hash, const char *name,
			       const void *ns, const struct kernfs_node *kn)
{
	if (hash != kn->hash)
		return hash < kn->hash ? -1 : 1;

	if (ns != kn->ns)
		return ns < kn->ns ? -1 : 1;

	return strcmp(name, kn->name);
}
將kernfs_node鏈入parent節點的紅黑樹中:

/**
* kernfs_link_sibling - link kernfs_node into sibling rbtree
* @kn: kernfs_node of interest
*
* Link @kn into its sibling rbtree which starts from
* @kn->parent->dir.children.
*
* Locking:
* mutex_lock(kernfs_mutex)
*
* RETURNS:
* 0 on success, -EEXIST on failure.
*/
static int kernfs_link_sibling(struct kernfs_node *kn)
{
struct rb_node **node = &kn->parent->dir.children.rb_node; //parent目錄的紅黑樹
struct rb_node *parent = NULL;
while (*node) { //在parent的目錄中,尋找合適的位置將kn插入parent的紅黑樹中
struct kernfs_node *pos;
int result;
pos = rb_to_kn(*node);
parent = *node;
result = kernfs_sd_compare(kn, pos); //優(yōu)先順序: hash > ns > name
if (result < 0)
node = &pos->rb.rb_left;
else if (result > 0)
node = &pos->rb.rb_right;
else
return -EEXIST;
}
/* add new node and rebalance the tree */
rb_link_node(&kn->rb, parent, node);
rb_insert_color(&kn->rb, &kn->parent->dir.children);
/* successfully added, account subdir number */
if (kernfs_type(kn) == KERNFS_DIR)
kn->parent->dir.subdirs++;
return 0;
}

在sysfs中創建文件

static inline int __must_check sysfs_create_file(struct kobject *kobj,
const struct attribute *attr)
{
return sysfs_create_file_ns(kobj, attr, NULL);
}
/**
* sysfs_create_file_ns - create an attribute file for an object with custom ns
* @kobj: object we're creating for
* @attr: attribute descriptor
* @ns: namespace the new file should belong to
*/
int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
const void *ns)
{
/* The object, its sysfs directory node and the attribute must all be present. */
BUG_ON(!kobj || !kobj->sd || !attr);
/* Add a regular (is_bin == false) attribute file under the object's directory, using attr->mode. */
return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns);
}
EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
int sysfs_add_file_mode_ns(struct kernfs_node *parent,
const struct attribute *attr, bool is_bin,
umode_t mode, const void *ns)
{
struct lock_class_key *key = NULL;
const struct kernfs_ops *ops;
struct kernfs_node *kn;
loff_t size;
if (!is_bin) {
struct kobject *kobj = parent->priv;
const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
/* every kobject with an attribute needs a ktype assigned */
if (WARN(!sysfs_ops, KERN_ERR
"missing sysfs attribute operations for kobject: %s\n",
kobject_name(kobj)))
return -EINVAL;
//確定讀寫的操作函數(shù)
if (sysfs_ops->show && sysfs_ops->store) {
if (mode & SYSFS_PREALLOC)
ops = &sysfs_prealloc_kfops_rw;
else
ops = &sysfs_file_kfops_rw;
} else if (sysfs_ops->show) {
if (mode & SYSFS_PREALLOC)
ops = &sysfs_prealloc_kfops_ro;
else
ops = &sysfs_file_kfops_ro;
} else if (sysfs_ops->store) {
if (mode & SYSFS_PREALLOC)
ops = &sysfs_prealloc_kfops_wo;
else
ops = &sysfs_file_kfops_wo;
} else
ops = &sysfs_file_kfops_empty;
size = PAGE_SIZE;
} else {
struct bin_attribute *battr = (void *)attr;
if (battr->mmap)
ops = &sysfs_bin_kfops_mmap;
else if (battr->read && battr->write)
ops = &sysfs_bin_kfops_rw;
else if (battr->read)
ops = &sysfs_bin_kfops_ro;
else if (battr->write)
ops = &sysfs_bin_kfops_wo;
else
ops = &sysfs_file_kfops_empty;
size = battr->size;
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (!attr->ignore_lockdep)
key = attr->key ?: (struct lock_class_key *)&attr->skey;
#endif
kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops,
(void *)attr, ns, key); //創(chuàng)建屬性文件
if (IS_ERR(kn)) {
if (PTR_ERR(kn) == -EEXIST)
sysfs_warn_dup(parent, attr->name);
return PTR_ERR(kn);
}
return 0;
}

通過上面的代碼跟蹤,創建屬性文件由__kernfs_create_file實現,最終仍然是調用kernfs_new_node和kernfs_add_one。

/**
* __kernfs_create_file - kernfs internal function to create a file
* @parent: directory to create the file in
* @name: name of the file
* @mode: mode of the file
* @size: size of the file
* @ops: kernfs operations for the file
* @priv: private data for the file
* @ns: optional namespace tag of the file
* @key: lockdep key for the file's active_ref, %NULL to disable lockdep
*
* Returns the created node on success, ERR_PTR() value on error.
*/
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
const char *name,
umode_t mode, loff_t size,
const struct kernfs_ops *ops,
void *priv, const void *ns,
struct lock_class_key *key)
{
struct kernfs_node *kn;
unsigned flags;
int rc;
flags = KERNFS_FILE; //創(chuàng)建的kern_node類型為file
//分配空間并初始化
kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
if (!kn)
return ERR_PTR(-ENOMEM);
kn->attr.ops = ops;
kn->attr.size = size;
kn->ns = ns;
kn->priv = priv;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (key) {
lockdep_init_map(&kn->dep_map, "s_active", key, 0);
kn->flags |= KERNFS_LOCKDEP;
}
#endif
/*
* kn->attr.ops is accesible only while holding active ref. We
* need to know whether some ops are implemented outside active
* ref. Cache their existence in flags.
*/
if (ops->seq_show)
kn->flags |= KERNFS_HAS_SEQ_SHOW;
if (ops->mmap)
kn->flags |= KERNFS_HAS_MMAP;
rc = kernfs_add_one(kn); //將kern_node添加到parent的紅黑樹中
if (rc) {
kernfs_put(kn);
return ERR_PTR(rc);
}
return kn;
}

在sysfs_add_file_mode_ns函數中,根據mode是否帶有SYSFS_PREALLOC標志以及sysfs_ops中show/store的實現情況,注冊不同的讀寫回調函數。下面以sysfs_prealloc_kfops_rw結構為例,其他結構類似,不贅述。

//常規文件--sysfs_prealloc_kfops_rw
/*
 * kernfs operations table for a readable and writable sysfs attribute with
 * .prealloc set: reads go to sysfs_kf_read(), writes to sysfs_kf_write().
 */
static const struct kernfs_ops sysfs_prealloc_kfops_rw = {
.read = sysfs_kf_read,
.write = sysfs_kf_write,
.prealloc = true,
};
/*
 * kernfs read callback for regular sysfs files with pre-alloc.
 *
 * Calls the owning kobject's ->show() to fill @buf and returns the number of
 * bytes produced, capped at @count. Returns 0 for any read at a non-zero
 * offset, or when @buf is not the file's preallocated buffer.
 */
static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
size_t count, loff_t pos)
{
const struct sysfs_ops *ops = sysfs_file_ops(of->kn); /* sysfs_ops table for this file (helper not shown here; presumably the kobject's ktype->sysfs_ops) */
struct kobject *kobj = of->kn->parent->priv; /* the parent directory node's priv is the owning kobject */
size_t len;
/*
 * If buf != of->prealloc_buf, we don't know how
 * large it is, so cannot safely pass it to ->show
 */
if (pos || WARN_ON_ONCE(buf != of->prealloc_buf))
return 0;
len = ops->show(kobj, of->kn->priv, buf); /* of->kn->priv is the struct attribute passed at file creation */
return min(count, len);
}
/* kernfs write callback for regular sysfs files */
static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
size_t count, loff_t pos)
{ //獲取kobject中的sysfs_ops操作表
const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
struct kobject *kobj = of->kn->parent->priv;
if (!count)
return 0;
return ops->store(kobj, of->kn->priv, buf, count); //kobject中sd域的sysfs_ops操作表中的store
}

關于屬性文件的讀寫操作,最終都回調到kobject中ktype域的sysfs_ops操作表,這個操作表是在kobject_init函數中設置。回顧kobject_create函數:

struct kobject *kobject_create(void)
{
struct kobject *kobj;
kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); //分配空間
if (!kobj)
return NULL;
kobject_init(kobj, &dynamic_kobj_ktype); //初始化, kobj_type類型為dynamic_kobj_ktype
return kobj;
}
/*
 * ktype that kobject_create() passes to kobject_init(): releasing goes through
 * dynamic_kobj_release() and attribute I/O through kobj_sysfs_ops.
 */
static struct kobj_type dynamic_kobj_ktype = {
.release = dynamic_kobj_release,
.sysfs_ops = &kobj_sysfs_ops,
};
const struct sysfs_ops kobj_sysfs_ops = {
.show = kobj_attr_show,
.store = kobj_attr_store,
};
EXPORT_SYMBOL_GPL(kobj_sysfs_ops);

kobject的sysfs的show和store方法為:kobj_attr_show和kobj_attr_store

static ssize_t kobj_attr_show(struct kobject *kobj, struct attribute *attr,
char *buf)
{
struct kobj_attribute *kattr;
ssize_t ret = -EIO;
kattr = container_of(attr, struct kobj_attribute, attr);
if (kattr->show) //如果業(yè)務(wù)子系統(tǒng)設(shè)置了show函數(shù),則調(diào)用
ret = kattr->show(kobj, kattr, buf);
return ret;
}
static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t count)
{
struct kobj_attribute *kattr;
ssize_t ret = -EIO;
kattr = container_of(attr, struct kobj_attribute, attr);
if (kattr->store) //如果業(yè)務(wù)子系統(tǒng)設(shè)置了store函數(shù),則調(diào)用
ret = kattr->store(kobj, kattr, buf, count);
return ret;
}

真正對屬性文件進行讀寫的回調由業務子系統實現。

在sysfs中創建符號鏈接

/**
* sysfs_create_link - create symlink between two objects.
* @kobj: object whose directory we're creating the link in.
* @target: object we're pointing to.
* @name: name of the symlink.
*/
int sysfs_create_link(struct kobject *kobj, struct kobject *target,
const char *name)
{
/* warn = 1: a duplicate name (-EEXIST) is reported further down the call chain */
return sysfs_do_create_link(kobj, target, name, 1);
}
EXPORT_SYMBOL_GPL(sysfs_create_link);
/*
 * Resolve the kernfs directory in which the symlink is created, then hand
 * over to sysfs_do_create_link_sd().
 *
 * @kobj:   object whose directory receives the link; NULL selects the sysfs root
 * @target: object the link points to
 * @name:   name of the symlink
 * @warn:   forwarded unchanged to sysfs_do_create_link_sd()
 *
 * Returns -EFAULT when no parent directory node is available, otherwise the
 * result of sysfs_do_create_link_sd().
 */
static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
const char *name, int warn)
{
struct kernfs_node *parent = NULL;
if (!kobj)
parent = sysfs_root_kn;
else
parent = kobj->sd;
/* kobj has no sysfs directory node (or the root node is not set) */
if (!parent)
return -EFAULT;
return sysfs_do_create_link_sd(parent, target, name, warn);
}
static int sysfs_do_create_link_sd(struct kernfs_node *parent,
struct kobject *target_kobj,
const char *name, int warn)
{
struct kernfs_node *kn, *target = NULL;
BUG_ON(!name || !parent);
/*
* We don't own @target_kobj and it may be removed at any time.
* Synchronize using sysfs_symlink_target_lock. See
* sysfs_remove_dir() for details.
*/
spin_lock(&sysfs_symlink_target_lock);
if (target_kobj->sd) {
target = target_kobj->sd;
kernfs_get(target);
}
spin_unlock(&sysfs_symlink_target_lock);
if (!target)
return -ENOENT;
kn = kernfs_create_link(parent, name, target); //創(chuàng)建sysfs符號鏈接
kernfs_put(target);
if (!IS_ERR(kn))
return 0;
if (warn && PTR_ERR(kn) == -EEXIST)
sysfs_warn_dup(parent, name);
return PTR_ERR(kn);
}

由上面的代碼追蹤,創建符號鏈接由kernfs_create_link函數實現。

/**
* kernfs_create_link - create a symlink
* @parent: directory to create the symlink in
* @name: name of the symlink
* @target: target node for the symlink to point to
*
* Returns the created node on success, ERR_PTR() value on error.
*/
struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
const char *name,
struct kernfs_node *target)
{
struct kernfs_node *kn;
int error;
//指定創(chuàng)建符號鏈接
kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);
if (!kn)
return ERR_PTR(-ENOMEM);
if (kernfs_ns_enabled(parent))
kn->ns = target->ns;
kn->symlink.target_kn = target;
kernfs_get(target); /* ref owned by symlink */
error = kernfs_add_one(kn); //將kern_node添加到parent的紅黑樹中
if (!error)
return kn;
kernfs_put(kn);
return ERR_PTR(error);
}

與創建目錄和文件類似,最終仍然是調用kernfs_new_node和kernfs_add_one實現。

基于內核對象編程套路

目標:在sysfs中創建一個目錄/sys/kernel/storage/,并在該目錄下創建一個文件value。value可以寫入整型數據,隨后可以讀出。

struct storage_obj {
struct kobject kobj;
int val; //用于保存寫入的數(shù)據(jù)
};
struct storage_attribute {
struct attribute *attr;
ssize_t (*show)(struct kobject *, struct attribute *, char *);
ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t);
}
//定義并初始化storage_attribute
struct storage_attribute *sattr = &struct storage_attribute {
.attr = {.name = "value", .mode = 0666},
.show = storage_show,
.store = storage_store,
};
ssize_t storage_show(struct kobject *kobj, struct attribute *attr, char *buf)
{
struct storage *stor = container_of(kobj, struct storage_obj, kobj);
stor->val = atoi(buf);
}
ssize_t storage_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t s) {
struct storage *stor = container_of(kobj, struct storage_obj, kobj);
memcpy(buf, s, itoa(stor->val));
}
void storage_release(struct kobject *kobj)
{
......
}
struct storage_ktype {
struct kobj_type *ktype;
}
int storage_create_file(struct storage_obj *sobj, const struct storage_attribute *attr)
{
int error = 0;
if (sobj) {
error = sysfs_create_file(&sobj->kobj, &attr->attr);
}
return error;
}
void storage_remove_file(struct storage_obj *sobj, const struct storage_attribute *attr)
{
if (sobj) {
sysfs_remove_file(&sobj->kobj, &attr->attr);
}
}
struct storage_obj * create_storage_obj()
{
struct storage_obj *sobj = (struct storage_obj *)malloc(struct storage_obj);
struct storage_ktype *stype = (struct storage_ktype *)malloc(struct storage_ktype);
sobj->parent = kernel_kobj;
kobject_init_and_add(&sobj->kobj, &stype->ktype);
return sobj
}
void destroy_storage_obj(struct kobject *kobj) {
struct storage_obj *sobj = container_of(kobj, struct storage_obj, kobj);
kobject_del(kboj);
free(sobj);
free(stype);
}
設備驅動模型概述

Linux的設備驅動模型能夠帶來以下的優點:

內核基于內核對象和sysfs,通過抽象以下五種概念(bus_type、device、driver、class和class_interface),實現了設備驅動模型的框架,使得編寫子系統成為“八股文”。

在Linux內核中,子系統是由bus_type, device, driver, class和class_interface之間的關系所描述,而設備驅動模型正是這些關系的核心實現,使得在編寫子系統程序時,只要遵循設備模型的套路,便不需要關注于這些復雜的關系,只需實現自身的業務邏輯。
總線類型

總線類型的數據結構

struct bus_type {
const char *name; //子系統(tǒng)名稱
const char *dev_name; //供子系統(tǒng)生成設(shè)備名稱使用
struct device *dev_root;
struct device_attribute *dev_attrs; /* use dev_groups instead */
const struct attribute_group **bus_groups; //總線類型使用的屬性組
const struct attribute_group **dev_groups; //設(shè)備使用的屬性組
const struct attribute_group **drv_groups; //驅(qū)動使用的屬性組
int (*match)(struct device *dev, struct device_driver *drv); //檢測設(shè)備與驅(qū)動是否可以綁定
int (*uevent)(struct device *dev, struct kobj_uevent_env *env); //發(fā)送事件前,設(shè)置bus特有的環(huán)境變量
int (*probe)(struct device *dev); //當設(shè)備可以綁定到驅(qū)動時,對設(shè)備進行初始化和執(zhí)行綁定
int (*remove)(struct device *dev); //當設(shè)備從驅(qū)動中解綁時,回調(diào)
void (*shutdown)(struct device *dev); //當設(shè)備斷電時,回調(diào)
int (*online)(struct device *dev); //當設(shè)備上電時,回調(diào)
int (*offline)(struct device *dev); //當設(shè)備下電時,回調(diào)
int (*suspend)(struct device *dev, pm_message_t state); //當設(shè)備進入節(jié)能狀態(tài)時,回調(diào)
int (*resume)(struct device *dev); //當設(shè)備恢復正常狀態(tài)時,回調(diào)
const struct dev_pm_ops *pm; //電源管理相關(guān)
const struct iommu_ops *iommu_ops;
struct subsys_private *p; //子系統(tǒng)私有類型
struct lock_class_key lock_key;
};
struct subsys_private {
struct kset subsys; //總線kset,scsi子系統(tǒng)對應/sys/bus/scsi
struct kset *devices_kset; //設(shè)備kset, scsi子系統(tǒng)對應/sys/bus/scsi/devices
struct list_head interfaces; //總線的接口鏈表
struct mutex mutex;
struct kset *drivers_kset; //驅(qū)動kset, scsi子系統(tǒng)對應/sys/bus/scsi/drivers
struct klist klist_devices; //總線的設(shè)備鏈表
struct klist klist_drivers; //總線的驅(qū)動鏈表
struct blocking_notifier_head bus_notifier; //子系統(tǒng)變化時,需要通知的鏈表
unsigned int drivers_autoprobe:1; //是否允許設(shè)備或驅(qū)動加載時,自動探測
struct bus_type *bus; //指向總線類型
struct kset glue_dirs;
struct class *class; //指向總線類型的類
};

從上面的兩個結構可以看到,bus_type包含的主要是實現子系統時應該具體關注的內容,比如name和一組回調函數。而subsys_private結構主要是設備驅動模型中的關系的表達,如字段subsys的類型是kset,描述該子系統在sysfs中的表達;klist_devices和klist_drivers分別是設備鏈表和驅動鏈表,用于管理總線類型的所有設備和驅動。之后仍然會遇到xxx_private的結構,以這種方式命名的結構,都是給設備驅動模型核心使用的,業務子系統無需也不能使用。

總線類型注冊/反注冊

實現子系統的第一步就是創建bus_type,并將其注冊到系統,此時需要調用bus_register:

/**
* bus_register - register a driver-core subsystem
* @bus: bus to register
*
* Once we have that, we register the bus with the kobject
* infrastructure, then register the children subsystems it has:
* the devices and drivers that belong to the subsystem.
*/
int bus_register(struct bus_type *bus)
{
int retval;
struct subsys_private *priv;
struct lock_class_key *key = &bus->lock_key;
/* allocate the private (driver-core side) data of the bus type */
priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL);
if (!priv)
return -ENOMEM;
priv->bus = bus; /* link bus_type and subsys_private to each other */
bus->p = priv;
BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier);
/* name the embedded kobject after the bus type */
retval = kobject_set_name(&priv->subsys.kobj, "%s", bus->name);
if (retval)
goto out;
priv->subsys.kobj.kset = bus_kset;
priv->subsys.kobj.ktype = &bus_ktype;
priv->drivers_autoprobe = 1; /* enable automatic probing */
retval = kset_register(&priv->subsys); /* register the bus kset itself */
if (retval)
goto out;
retval = bus_create_file(bus, &bus_attr_uevent); /* create the uevent attribute file */
if (retval)
goto bus_uevent_fail;
priv->devices_kset = kset_create_and_add("devices", NULL, /* "devices" kset, parented to the bus kobject */
&priv->subsys.kobj);
if (!priv->devices_kset) {
retval = -ENOMEM;
goto bus_devices_fail;
}
priv->drivers_kset = kset_create_and_add("drivers", NULL, /* "drivers" kset, parented to the bus kobject */
&priv->subsys.kobj);
if (!priv->drivers_kset) {
retval = -ENOMEM;
goto bus_drivers_fail;
}
/* initialise the lists and the mutex */
INIT_LIST_HEAD(&priv->interfaces);
__mutex_init(&priv->mutex, "subsys mutex", key);
klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put);
klist_init(&priv->klist_drivers, NULL, NULL);
retval = add_probe_files(bus); /* add the probe control files (drivers_autoprobe and drivers_probe, per the original note) */
if (retval)
goto bus_probe_files_fail;
retval = bus_add_groups(bus, bus->bus_groups); /* add the bus type's own attribute groups */
if (retval)
goto bus_groups_fail;
pr_debug("bus: '%s': registered\n", bus->name);
return 0;
/* failure: undo the steps above in reverse order; each label falls through to the next */
bus_groups_fail:
remove_probe_files(bus);
bus_probe_files_fail:
kset_unregister(bus->p->drivers_kset);
bus_drivers_fail:
kset_unregister(bus->p->devices_kset);
bus_devices_fail:
bus_remove_file(bus, &bus_attr_uevent);
bus_uevent_fail:
kset_unregister(&bus->p->subsys);
out:
kfree(bus->p);
bus->p = NULL;
return retval;
}
EXPORT_SYMBOL_GPL(bus_register);

注冊總線類型后,便可以在系統看到:

root@ubuntu16:~# ls /sys/bus/scsi -l
total 0
drwxr-xr-x 2 root root 0 Sep 5 16:01 devices
drwxr-xr-x 4 root root 0 Sep 2 09:44 drivers
-rw-r--r-- 1 root root 4096 Sep 5 11:29 drivers_autoprobe
--w------- 1 root root 4096 Sep 5 11:29 drivers_probe
--w------- 1 root root 4096 Sep 2 09:44 uevent
root@ubuntu16:~#

當從系統中注銷子系統時,需要調用bus_unregister,完成總線類型的反注冊:

/**
* bus_unregister - remove a bus from the system
* @bus: bus.
*
* Unregister the child subsystems and the bus itself.
* Finally, we call bus_put() to release the refcount
*/
void bus_unregister(struct bus_type *bus)
{
pr_debug("bus: '%s': unregistering\n", bus->name);
if (bus->dev_root)
device_unregister(bus->dev_root); /* remove the root device, if one was set */
bus_remove_groups(bus, bus->bus_groups); /* remove the bus attribute groups */
remove_probe_files(bus); /* remove the probe control files (drivers_autoprobe and drivers_probe, per the original note) */
kset_unregister(bus->p->drivers_kset); /* remove the "drivers" kset */
kset_unregister(bus->p->devices_kset); /* remove the "devices" kset */
bus_remove_file(bus, &bus_attr_uevent); /* remove the uevent file */
kset_unregister(&bus->p->subsys); /* finally remove the bus kset itself */
}
EXPORT_SYMBOL_GPL(bus_unregister);

設備

設備的數據結構

struct device {
struct device *parent; //指向父設(shè)備,eg.HBA
struct device_private *p; //設(shè)備私有指針
struct kobject kobj; //內(nèi)嵌kobject
const char *init_name; /* initial name of the device */
const struct device_type *type; //設(shè)備類型,抽象出來的域和方法
struct mutex mutex; /* mutex to synchronize calls to its driver */
struct bus_type *bus; /* type of bus device is on; devive歸屬的bus */
struct device_driver *driver; /* which driver has allocated this device */
void *platform_data; /* Platform specific data, device core doesn't touch it */
void *driver_data; /* Driver data, set and get with dev_set/get_drvdata */
struct dev_pm_info power;
struct dev_pm_domain *pm_domain;
#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
struct irq_domain *msi_domain;
#endif
#ifdef CONFIG_PINCTRL
struct dev_pin_info *pins;
#endif
#ifdef CONFIG_GENERIC_MSI_IRQ
struct list_head msi_list;
#endif
#ifdef CONFIG_NUMA
int numa_node; /* NUMA node this device is close to */
#endif
u64 *dma_mask; /* dma mask (if dma'able device) */
u64 coherent_dma_mask;/* Like dma_mask, but for
alloc_coherent mappings as
not all hardware supports
64 bit addresses for consistent
allocations such descriptors. */
unsigned long dma_pfn_offset;
struct device_dma_parameters *dma_parms;
struct list_head dma_pools; /* dma pools (if dma'ble) */
struct dma_coherent_mem *dma_mem; /* internal for coherent mem override */
#ifdef CONFIG_DMA_CMA
struct cma *cma_area; /* contiguous memory area for dma allocations */
#endif
/* arch specific additions */
struct dev_archdata archdata;
struct device_node *of_node; /* associated device tree node */
struct fwnode_handle *fwnode; /* firmware device node */
dev_t devt; /* dev_t, creates the sysfs "dev"; 設(shè)備號 */
u32 id; /* device instance */
spinlock_t devres_lock;
struct list_head devres_head; //設(shè)備資源鏈表頭
struct klist_node knode_class; //鏈入類的設(shè)備鏈表
struct class *class; //指向鏈入的類
const struct attribute_group **groups; /* optional groups 設(shè)備特有的屬性 */
void (*release)(struct device *dev); //設(shè)備是否回調(diào)
struct iommu_group *iommu_group;
bool offline_disabled:1;
bool offline:1;
};
/* Driver-core private part of a struct device (reached through dev->p). */
struct device_private {
struct klist klist_children; /* list of child devices */
struct klist_node knode_parent; /* node in the parent's klist_children */
struct klist_node knode_driver; /* node in the bound driver's device list */
struct klist_node knode_bus; /* node in the bus's device list */
struct list_head deferred_probe; /* entry in the deferred-probe list */
struct device *device; /* back-pointer to the owning device */
};
struct device_type {
const char *name; //設(shè)備類型的名稱
const struct attribute_group **groups; //設(shè)備的公有屬性組
int (*uevent)(struct device *dev, struct kobj_uevent_env *env); //發(fā)送事件前調(diào)用,用于設(shè)置事件環(huán)境變量
char *(*devnode)(struct device *dev, umode_t *mode, //在創(chuàng)建設(shè)備時,提供名字線索
kuid_t *uid, kgid_t *gid);
void (*release)(struct device *dev); //設(shè)備釋放時回調(diào)
const struct dev_pm_ops *pm;
};

在設備驅動模型中,device結構有bus域,指向device所屬的總線類型;class域指向device所屬的唯一的類;driver域指向設備所綁定的驅動。與內核對象一樣,設備也被組織成層次結構,通過parent指向父設備。

device_private結構由設備驅動模型處理,維護和其他結構之間的內部關系。device_type結構定義設備公有的屬性和方法。

設備的注冊與反注冊

當設備被發現后,需要將設備注冊到系統,需要調用device_register函數:

/**
* device_register - register a device with the system.
* @dev: pointer to the device structure
*
* This happens in two clean steps - initialize the device
* and add it to the system. The two steps can be called
* separately, but this is the easiest and most common.
* I.e. you should only call the two helpers separately if
* have a clearly defined need to use and refcount the device
* before it is added to the hierarchy.
*
* For more information, see the kerneldoc for device_initialize()
* and device_add().
*
* NOTE: _Never_ directly free @dev after calling this function, even
* if it returned an error! Always use put_device() to give up the
* reference initialized in this function instead.
*/
int device_register(struct device *dev)
{
device_initialize(dev); /* step 1: initialise the device structure */
return device_add(dev); /* step 2: add the device to the system; its result is returned */
}
EXPORT_SYMBOL_GPL(device_register);
/*
 * device_initialize - set up the fields of @dev before it is added.
 *
 * Initialises the embedded kobject, the DMA pool and devres lists, the mutex,
 * the devres spinlock and power-management state, and marks the NUMA node as
 * unset (-1). Nothing is registered with sysfs here.
 */
void device_initialize(struct device *dev)
{
dev->kobj.kset = devices_kset; /* member of the global devices kset (/sys/devices/ per the original note) */
kobject_init(&dev->kobj, &device_ktype); /* every device kobject uses device_ktype */
INIT_LIST_HEAD(&dev->dma_pools);
mutex_init(&dev->mutex);
lockdep_set_novalidate_class(&dev->mutex);
spin_lock_init(&dev->devres_lock);
INIT_LIST_HEAD(&dev->devres_head);
device_pm_init(dev);
set_dev_node(dev, -1);
#ifdef CONFIG_GENERIC_MSI_IRQ
INIT_LIST_HEAD(&dev->msi_list);
#endif
}
EXPORT_SYMBOL_GPL(device_initialize);

device_register函數調用device_initialize對device結構進行初始化,調用device_add函數完成設備添加到系統。

int device_add(struct device *dev)
{
struct device *parent = NULL;
struct kobject *kobj;
struct class_interface *class_intf;
int error = -EINVAL;
dev = get_device(dev);
if (!dev)
goto done;
if (!dev->p) { //如果device沒有設(shè)置devcie_private,在這里分配并初始化
error = device_private_init(dev);
if (error)
goto done;
}
/*
* for statically allocated devices, which should all be converted
* some day, we need to initialize the name. We prevent reading back
* the name, and force the use of dev_name()
*/
if (dev->init_name) {
dev_set_name(dev, "%s", dev->init_name); //設(shè)置device的kobject名字
dev->init_name = NULL;
}
/* subsystems can specify simple device enumeration */
if (!dev_name(dev) && dev->bus && dev->bus->dev_name) //如果device沒有設(shè)置init_name, 則使用bus的dev_name和設(shè)備id生成
dev_set_name(dev, "%s%u", dev->bus->dev_name, dev->id);
if (!dev_name(dev)) {
error = -EINVAL;
goto name_error;
}
pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
parent = get_device(dev->parent);
kobj = get_device_parent(dev, parent);
if (kobj)
dev->kobj.parent = kobj; //設(shè)置device的kobject的parent字段
/* use parent numa_node */
if (parent && (dev_to_node(dev) == NUMA_NO_NODE))
set_dev_node(dev, dev_to_node(parent));
/* first, register with generic layer. */
/* we require the name to be set before, and pass NULL */
error = kobject_add(&dev->kobj, dev->kobj.parent, NULL); //將device添加到parent的目錄中
if (error)
goto Error;
/* notify platform of device entry */
if (platform_notify)
platform_notify(dev);
error = device_create_file(dev, &dev_attr_uevent); //在設(shè)備目錄下創(chuàng)建uevent文件
if (error)
goto attrError;
error = device_add_class_symlinks(dev); //為設(shè)備創(chuàng)建和類相關(guān)的符號鏈接
if (error)
goto SymlinkError;
error = device_add_attrs(dev); //為設(shè)備的默認屬性添加對應的文件
if (error)
goto AttrsError;
error = bus_add_device(dev); //將device添加到bus_type
if (error)
goto BusError;
error = dpm_sysfs_add(dev);
if (error)
goto DPMError;
device_pm_add(dev);
if (MAJOR(dev->devt)) {
error = device_create_file(dev, &dev_attr_dev); //在設(shè)備目錄下創(chuàng)建dev屬性對應文件,用于保存設(shè)備號
if (error)
goto DevAttrError;
error = device_create_sys_dev_entry(dev); //在/sys/block和/sys/char創(chuàng)建一個到設(shè)備所在目錄的符號鏈接
if (error)
goto SysEntryError;
devtmpfs_create_node(dev); //在/dev下創(chuàng)建設(shè)備文件
}
/* Notify clients of device addition. This call must come
* after dpm_sysfs_add() and before kobject_uevent().
*/
if (dev->bus)
blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
BUS_NOTIFY_ADD_DEVICE, dev);
kobject_uevent(&dev->kobj, KOBJ_ADD); //發(fā)送設(shè)備ADD事件
bus_probe_device(dev); //嘗試將device綁定到device_driver
if (parent) //如果指定了parent,將設(shè)備添加到parent的孩子鏈表中
klist_add_tail(&dev->p->knode_parent,
&parent->p->klist_children);
if (dev->class) { //如果設(shè)置了class,將設(shè)備添加到類的設(shè)備鏈表
mutex_lock(&dev->class->p->mutex);
/* tie the class to the device */
klist_add_tail(&dev->knode_class,
&dev->class->p->klist_devices);
/* notify any interfaces that the device is here */
list_for_each_entry(class_intf, //調(diào)用device所屬的class中所有class_interface的add_dev
&dev->class->p->interfaces, node)
if (class_intf->add_dev)
class_intf->add_dev(dev, class_intf);
mutex_unlock(&dev->class->p->mutex);
}
done:
put_device(dev);
return error;
SysEntryError:
if (MAJOR(dev->devt))
device_remove_file(dev, &dev_attr_dev);
DevAttrError:
device_pm_remove(dev);
dpm_sysfs_remove(dev);
DPMError:
bus_remove_device(dev);
BusError:
device_remove_attrs(dev);
AttrsError:
device_remove_class_symlinks(dev);
SymlinkError:
device_remove_file(dev, &dev_attr_uevent);
attrError:
kobject_uevent(&dev->kobj, KOBJ_REMOVE);
kobject_del(&dev->kobj);
Error:
cleanup_device_parent(dev);
put_device(parent);
name_error:
kfree(dev->p);
dev->p = NULL;
goto done;
}
EXPORT_SYMBOL_GPL(device_add);

設備添加到系統的主要流程都在device_add函數實現,上面代碼的注釋基本把主要函數的作用進行了描述。值得關注的一個函數便是bus_probe_device,該函數完成將設備綁定到驅動的動作。

void bus_probe_device(struct device *dev)
{
struct bus_type *bus = dev->bus;
struct subsys_interface *sif;
if (!bus)
return;
if (bus->p->drivers_autoprobe) //如果bus允許自動探測
device_initial_probe(dev); //主要功能
mutex_lock(&bus->p->mutex);
list_for_each_entry(sif, &bus->p->interfaces, node) //將設(shè)備綁定到接口
if (sif->add_dev)
sif->add_dev(dev, sif);
mutex_unlock(&bus->p->mutex);
}
/* First probe of a newly added device: attach with asynchronous probing allowed. */
void device_initial_probe(struct device *dev)
{
__device_attach(dev, true); /* allow_async = true */
}
static int __device_attach(struct device *dev, bool allow_async)
{
int ret = 0;
device_lock(dev);
if (dev->driver) { //指定了device所要綁定的driver
if (klist_node_attached(&dev->p->knode_driver)) { //檢查knode_driver是否綁定到鏈表
ret = 1;
goto out_unlock;
}
ret = device_bind_driver(dev); //綁定,修改相應鏈表
if (ret == 0)
ret = 1;
else {
dev->driver = NULL;
ret = 0;
}
} else { //沒有指定device要綁定的driver
struct device_attach_data data = {
.dev = dev,
.check_async = allow_async,
.want_async = false,
};
if (dev->parent)
pm_runtime_get_sync(dev->parent);
//遍歷bus中所有驅(qū)動,嘗試attach
ret = bus_for_each_drv(dev->bus, NULL, &data,
__device_attach_driver);
if (!ret && allow_async && data.have_async) {
/*
* If we could not find appropriate driver
* synchronously and we are allowed to do
* async probes and there are drivers that
* want to probe asynchronously, we'll
* try them.
*/
dev_dbg(dev, "scheduling asynchronous probe\n");
get_device(dev);
async_schedule(__device_attach_async_helper, dev);
} else {
pm_request_idle(dev);
}
if (dev->parent)
pm_runtime_put(dev->parent);
}
out_unlock:
device_unlock(dev);
return ret;
}

通過上面3個函數的追蹤,__device_attach函數遍歷bus所有的驅動,嘗試執行attach,具體調用__device_attach_driver函數。

static int __device_attach_driver(struct device_driver *drv, void *_data)
{
struct device_attach_data *data = _data;
struct device *dev = data->dev;
bool async_allowed;
/*
* Check if device has already been claimed. This may
* happen with driver loading, device discovery/registration,
* and deferred probe processing happens all at once with
* multiple threads.
*/
if (dev->driver)
return -EBUSY;
if (!driver_match_device(drv, dev)) //調(diào)用bus的match函數(shù),測試是否匹配
return 0;
//進一步probe設(shè)備,需要設(shè)備已經(jīng)注冊
async_allowed = driver_allows_async_probing(drv);
if (async_allowed)
data->have_async = true;
//如果允許異步探測,則先返回
if (data->check_async && async_allowed != data->want_async)
return 0;
return driver_probe_device(drv, dev);
}
int driver_probe_device(struct device_driver *drv, struct device *dev)
{
int ret = 0;
if (!device_is_registered(dev)) //檢查device是否register
return -ENODEV;
pr_debug("bus: '%s': %s: matched device %s with driver %s\n",
drv->bus->name, __func__, dev_name(dev), drv->name);
if (dev->parent)
pm_runtime_get_sync(dev->parent);
pm_runtime_barrier(dev);
ret = really_probe(dev, drv); //真正執(zhí)行探測
pm_request_idle(dev);
if (dev->parent)
pm_runtime_put(dev->parent);
return ret;
}

從上面兩個函數來看,真正執行probe的函數是really_probe。

//返回1表示成功,返回0表示中間步驟出現異常,已回滾所有操作。
static int really_probe(struct device *dev, struct device_driver *drv)
{
int ret = 0;
int local_trigger_count = atomic_read(&deferred_trigger_count);
atomic_inc(&probe_count);
pr_debug("bus: '%s': %s: probing driver %s with device %s\n",
drv->bus->name, __func__, drv->name, dev_name(dev));
WARN_ON(!list_empty(&dev->devres_head));
dev->driver = drv; //將設(shè)備的driver指向當前驅(qū)動
/* If using pinctrl, bind pins now before probing */
ret = pinctrl_bind_pins(dev);
if (ret)
goto probe_failed;
if (driver_sysfs_add(dev)) { //在sysfs驅(qū)動目錄中創(chuàng)建指向設(shè)備的符號鏈接,同時在設(shè)備目錄中創(chuàng)建指向驅(qū)動的符號鏈接
printk(KERN_ERR "%s: driver_sysfs_add(%s) failed\n",
__func__, dev_name(dev));
goto probe_failed;
}
if (dev->pm_domain && dev->pm_domain->activate) {
ret = dev->pm_domain->activate(dev);
if (ret)
goto probe_failed;
}
/*
* Ensure devices are listed in devices_kset in correct order
* It's important to move Dev to the end of devices_kset before
* calling .probe, because it could be recursive and parent Dev
* should always go first
*/
devices_kset_move_last(dev);
if (dev->bus->probe) {
ret = dev->bus->probe(dev); //優(yōu)先調(diào)用bus_type中的probe方法
if (ret)
goto probe_failed;
} else if (drv->probe) {
ret = drv->probe(dev); //其次,調(diào)用driver中的probe方法
if (ret)
goto probe_failed;
}
pinctrl_init_done(dev);
if (dev->pm_domain && dev->pm_domain->sync)
dev->pm_domain->sync(dev);
driver_bound(dev); //將設(shè)備鏈入驅(qū)動的設(shè)備鏈表
ret = 1;
pr_debug("bus: '%s': %s: bound device %s to driver %s\n",
drv->bus->name, __func__, dev_name(dev), drv->name);
goto done;
probe_failed: //探測失敗, 回滾操作
devres_release_all(dev);
driver_sysfs_remove(dev);
dev->driver = NULL;
dev_set_drvdata(dev, NULL);
if (dev->pm_domain && dev->pm_domain->dismiss)
dev->pm_domain->dismiss(dev);
switch (ret) {
case -EPROBE_DEFER:
/* Driver requested deferred probing */
dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name);
driver_deferred_probe_add(dev);
/* Did a trigger occur while probing? Need to re-trigger if yes */
if (local_trigger_count != atomic_read(&deferred_trigger_count))
driver_deferred_probe_trigger();
break;
case -ENODEV:
case -ENXIO:
pr_debug("%s: probe of %s rejects match %d\n",
drv->name, dev_name(dev), ret);
break;
default:
/* driver matched but the probe failed */
printk(KERN_WARNING
"%s: probe of %s failed with error %d\n",
drv->name, dev_name(dev), ret);
}
/*
* Ignore errors returned by ->probe so that the next driver can try
* its luck.
*/
ret = 0;
done:
atomic_dec(&probe_count);
wake_up(&probe_waitqueue);
return ret;
}

到此,設備添加到系統的主要流程便基本清楚,不再往下跟蹤。

驅動

驅動數據結構

struct device_driver {
const char *name; //driver名稱
struct bus_type *bus; //driver所屬的bus_type
struct module *owner;
const char *mod_name; /* used for built-in modules */
bool suppress_bind_attrs; /* disables bind/unbind via sysfs */
enum probe_type probe_type;
const struct of_device_id *of_match_table;
const struct acpi_device_id *acpi_match_table;
int (*probe) (struct device *dev); //在device綁定到driver之前,對device進行初始化
int (*remove) (struct device *dev); //在device解綁到driver時,回調(diào)
void (*shutdown) (struct device *dev);
int (*suspend) (struct device *dev, pm_message_t state);
int (*resume) (struct device *dev);
const struct attribute_group **groups; //driver的屬性
const struct dev_pm_ops *pm; //電源相關(guān)
struct driver_private *p; //driver私有結(jié)構(gòu)
};
struct driver_private {
struct kobject kobj;
struct klist klist_devices; //driver所支持的device鏈表
struct klist_node knode_bus; //鏈入bus_type的驅(qū)動鏈表中
struct module_kobject *mkobj;
struct device_driver *driver; //指向driver
};

device_driver結構中,bus域指向驅動所屬的總線類型;driver_private結構中的knode_bus域用于鏈入總線類型的驅動鏈表,klist_devices域用于鏈接所有綁定到本驅動的設備。

驅動注冊與反注冊

驅動在加載時,需要將其注冊到總線類型,調用driver_register實現:

int driver_register(struct device_driver *drv)
{
int ret;
struct device_driver *other;
BUG_ON(!drv->bus->p); //確保bus已經(jīng)注冊到驅(qū)動模型中
//如果bus_type和driver都實現(xiàn)了同一個回調(diào),優(yōu)先使用bus_type的回調(diào)函數(shù),打印告警信息
if ((drv->bus->probe && drv->probe) ||
(drv->bus->remove && drv->remove) ||
(drv->bus->shutdown && drv->shutdown))
printk(KERN_WARNING "Driver '%s' needs updating - please use "
"bus_type methods\n", drv->name);
other = driver_find(drv->name, drv->bus); //根據(jù)名字查找驅(qū)動
if (other) {
printk(KERN_ERR "Error: Driver '%s' is already registered, "
"aborting...\n", drv->name);
return -EBUSY;
}
ret = bus_add_driver(drv); //將driver添加到bus
if (ret)
return ret;
ret = driver_add_groups(drv, drv->groups); //創(chuàng)建driver屬性文件
if (ret) {
bus_remove_driver(drv);
return ret;
}
kobject_uevent(&drv->p->kobj, KOBJ_ADD); //發(fā)送ADD事件到用戶空間
return ret;
}
EXPORT_SYMBOL_GPL(driver_register);

添加driver到bus_type,由bus_add_driver實現:

int bus_add_driver(struct device_driver *drv)
{
struct bus_type *bus;
struct driver_private *priv;
int error = 0;
bus = bus_get(drv->bus);
if (!bus)
return -EINVAL;
pr_debug("bus: '%s': add driver %s\n", bus->name, drv->name);
priv = kzalloc(sizeof(*priv), GFP_KERNEL); //分配driver_private結(jié)構(gòu)空間
if (!priv) {
error = -ENOMEM;
goto out_put_bus;
}
klist_init(&priv->klist_devices, NULL, NULL); //初始化driver設(shè)備鏈表
priv->driver = drv; //關(guān)聯(lián)device_driver和driver_private
drv->p = priv;
priv->kobj.kset = bus->p->drivers_kset; //driver_private中的kobj的kset域指向subsys中的drivers_kset
error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL, //添加driver到sysfs
"%s", drv->name);
if (error)
goto out_unregister;
klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers); //添加driver到bus的驅(qū)動鏈表中
if (drv->bus->p->drivers_autoprobe) { //自動探測
if (driver_allows_async_probing(drv)) { //允許異步執(zhí)行probe
pr_debug("bus: '%s': probing driver %s asynchronously\n",
drv->bus->name, drv->name);
async_schedule(driver_attach_async, drv); //異步probe
} else {
error = driver_attach(drv); //同步probe
if (error)
goto out_unregister;
}
}
module_add_driver(drv->owner, drv); //驅(qū)動實現(xiàn)的模塊
error = driver_create_file(drv, &driver_attr_uevent); //在driver中添加uevent屬性文件
if (error) {
printk(KERN_ERR "%s: uevent attr (%s) failed\n",
__func__, drv->name);
}
error = driver_add_groups(drv, bus->drv_groups); //添加driver的屬性文件
if (error) {
/* How the hell do we get out of this pickle? Give up */
printk(KERN_ERR "%s: driver_create_groups(%s) failed\n",
__func__, drv->name);
}
if (!drv->suppress_bind_attrs) {
error = add_bind_files(drv); //在driver目錄添加的bind和unbind兩個屬性文件
if (error) {
/* Ditto */
printk(KERN_ERR "%s: add_bind_files(%s) failed\n",
__func__, drv->name);
}
}
return 0;
out_unregister:
kobject_put(&priv->kobj);
kfree(drv->p);
drv->p = NULL;
out_put_bus:
bus_put(bus);
return error;
}bus_add_driver函數(shù)完成驅(qū)動添加到總線類型,當驅(qū)動添加完成后,如果總線類型設(shè)置了允許自動探測標志drivers_autoprobe,便可以根據(jù)是否允許異步探測調(diào)用driver_attach_async或driver_attach,driver_attach_async也是調(diào)用driver_attach: int driver_attach(struct device_driver *drv)
{
return bus_for_each_dev(drv->bus, NULL, drv, __driver_attach);
}
EXPORT_SYMBOL_GPL(driver_attach);
/*
 * Per-device callback that driver_attach() hands to bus_for_each_dev().
 * @dev:  device currently being visited
 * @data: the struct device_driver being attached
 *
 * Always returns 0 so the iteration continues with the next device.
 */
static int __driver_attach(struct device *dev, void *data)
{
struct device_driver *drv = data;
/*
* Lock device and try to bind to it. We drop the error
* here and always return 0, because we need to keep trying
* to bind to devices and some drivers will return an error
* simply if it didn't support the device.
*
* driver_probe_device() will spit a warning if there
* is an error.
*/
if (!driver_match_device(drv, dev)) // no match: skip this device (per the original note this goes through bus_type.match)
return 0;
/* the parent, if any, is locked before the device itself */
if (dev->parent) /* Needed for USB */
device_lock(dev->parent);
device_lock(dev);
/* only probe devices that have no driver bound yet */
if (!dev->driver)
driver_probe_device(drv, dev); // does the main probe work
device_unlock(dev);
if (dev->parent)
device_unlock(dev->parent);
return 0;
}
/*
 * Try to bind @dev to @drv.
 *
 * Returns -ENODEV if the device is not registered, otherwise the result
 * of really_probe(). The runtime-PM calls around really_probe() are kept
 * in exactly this order.
 */
int driver_probe_device(struct device_driver *drv, struct device *dev)
{
int ret = 0;
if (!device_is_registered(dev)) // refuse devices that are not registered
return -ENODEV;
pr_debug("bus: '%s': %s: matched device %s with driver %s\n",
drv->bus->name, __func__, dev_name(dev), drv->name);
/* runtime-PM reference on the parent for the duration of the probe */
if (dev->parent)
pm_runtime_get_sync(dev->parent);
pm_runtime_barrier(dev);
ret = really_probe(dev, drv); // the actual probe
pm_request_idle(dev);
/* drop the parent reference taken above */
if (dev->parent)
pm_runtime_put(dev->parent);
return ret;
}根據(jù)上面3個函數(shù),最終仍然是調(diào)用前面描述過的really_probe函數(shù)完成最后的探測。
驅(qū)動卸載時,需要調(diào)用driver_unregister函數(shù),使driver脫離總線類型: void driver_unregister(struct device_driver *drv)
{
/*
 * Remove @drv from its bus. A NULL driver, or one without private data
 * (drv->p), triggers a warning and is otherwise ignored.
 */
if (!drv || !drv->p) {
WARN(1, "Unexpected driver unregister!\n");
return;
}
driver_remove_groups(drv, drv->groups); // remove the driver's own attribute groups
bus_remove_driver(drv); // detach the driver from its bus
}
EXPORT_SYMBOL_GPL(driver_unregister);
/*
 * Detach @drv from its bus: remove the driver's sysfs files, unlink it
 * from the bus' driver list, detach it from its devices and drop the
 * kobject and bus references. Does nothing when drv->bus is NULL.
 */
void bus_remove_driver(struct device_driver *drv)
{
if (!drv->bus)
return;
if (!drv->suppress_bind_attrs)
remove_bind_files(drv); // remove the "bind"/"unbind" files of the driver
driver_remove_groups(drv, drv->bus->drv_groups); // remove the bus-defined driver attribute groups
driver_remove_file(drv, &driver_attr_uevent); // remove the driver's "uevent" file
klist_remove(&drv->p->knode_bus); // unlink the driver from the bus' driver list
pr_debug("bus: '%s': remove driver %s\n", drv->bus->name, drv->name);
driver_detach(drv); // unbind the driver from all devices bound to it
module_remove_driver(drv); // counterpart of module_add_driver()
kobject_put(&drv->p->kobj); // drop the kobject reference
bus_put(drv->bus);
}類類數(shù)據(jù)結(jié)構(gòu)struct class {
const char *name; //類名稱
struct module *owner; //指向?qū)崿F(xiàn)這個類的模塊的指針
struct class_attribute *class_attrs; //類公共屬性
const struct attribute_group **dev_groups; //歸屬與該類的設(shè)備的默認屬性
struct kobject *dev_kobj; //類鏈入sysfs的kobject
int (*dev_uevent)(struct device *dev, struct kobj_uevent_env *env); //發(fā)送事件前,設(shè)置類的特定環(huán)境變量
char *(*devnode)(struct device *dev, umode_t *mode); //創(chuàng)建設(shè)備時,返回設(shè)備名稱
void (*class_release)(struct class *class); //類釋放時回調(diào)
void (*dev_release)(struct device *dev); //設(shè)備釋放時回調(diào)
int (*suspend)(struct device *dev, pm_message_t state); //設(shè)備進入睡眠狀態(tài)時,回調(diào)
int (*resume)(struct device *dev); //設(shè)備被喚醒時,回調(diào)
const struct kobj_ns_type_operations *ns_type; //sysfs支持命名空間
const void *(*namespace)(struct device *dev); //返回設(shè)備所在的命名空間
const struct dev_pm_ops *pm; //電源相關(guān)
struct subsys_private *p; //類所屬的子系統(tǒng)私有數(shù)據(jù)結(jié)構(gòu)
};類的私有數據類型與總線類型的私有數據類型都是subsys_private,這里將不再重復描述。 類注冊與反注冊子系統需要使用類時,需要調用class_register函數向設備驅動模型注冊類: #define class_register(class) ({ static struct lock_class_key __key; __class_register(class, &__key); })
int __class_register(struct class *cls, struct lock_class_key *key)
{
struct subsys_private *cp;
int error;
pr_debug("device class '%s': registering\n", cls->name);
cp = kzalloc(sizeof(*cp), GFP_KERNEL); //分配私有數(shù)據(jù)空間
if (!cp)
return -ENOMEM;
klist_init(&cp->klist_devices, klist_class_dev_get, klist_class_dev_put); //初始化該class的device鏈表
INIT_LIST_HEAD(&cp->interfaces); //初始化接口鏈表
kset_init(&cp->glue_dirs);
__mutex_init(&cp->mutex, "subsys mutex", key);
error = kobject_set_name(&cp->subsys.kobj, "%s", cls->name); //將在/sys/class/目錄下顯示該名稱
if (error) {
kfree(cp);
return error;
}
/* set the default /sys/dev directory for devices of this class */
if (!cls->dev_kobj)
cls->dev_kobj = sysfs_dev_char_kobj;
#if defined(CONFIG_BLOCK)
/* let the block class directory show up in the root of sysfs */
if (!sysfs_deprecated || cls != &block_class)
cp->subsys.kobj.kset = class_kset;
#else
cp->subsys.kobj.kset = class_kset; // 全局變量class_kset指的是 /sys/class/
#endif
cp->subsys.kobj.ktype = &class_ktype;
cp->class = cls; //class與subsys_private關(guān)聯(lián)
cls->p = cp;
error = kset_register(&cp->subsys); //在/sys/class/目錄下創(chuàng)建該類對應的目錄
if (error) {
kfree(cp);
return error;
}
error = add_class_attrs(class_get(cls)); //在/sys/class/xxx/目錄下創(chuàng)建類屬性文件
class_put(cls);
return error;
}
EXPORT_SYMBOL_GPL(__class_register);類的注冊比較簡單,注釋已經比較詳細。當子系統需要卸載類時,需要調用class_unregister函數: void class_unregister(struct class *cls)
{
pr_debug("device class '%s': unregistering\n", cls->name);
remove_class_attrs(cls); //刪除/sys/class/xxx/目錄下的類屬性文件
kset_unregister(&cls->p->subsys); //刪除/sys/class/目錄
}接口接口數(shù)據(jù)結(jié)構(gòu)struct class_interface {
struct list_head node; //鏈入class中
struct class *class; //指向所屬的class
//在接口被添加或者設(shè)備被添加到接口所在的類時,從接口中添加或刪除設(shè)備
int (*add_dev) (struct device *, struct class_interface *);
void (*remove_dev) (struct device *, struct class_interface *);
};接口注冊與反注冊向類中注冊接口,需要調(diào)用class_interface_register函數(shù)完成: int class_interface_register(struct class_interface *class_intf)
{
struct class *parent;
struct class_dev_iter iter;
struct device *dev;
if (!class_intf || !class_intf->class) //確保class和class_interface都存在
return -ENODEV;
parent = class_get(class_intf->class); //增加引用計數(shù),并返回接口所屬的class
if (!parent)
return -EINVAL;
mutex_lock(&parent->p->mutex);
list_add_tail(&class_intf->node, &parent->p->interfaces); //將class_interface添加到class的接口鏈表
if (class_intf->add_dev) { //如果接口設(shè)置了add_dev方法,對該class的所有device調(diào)用
class_dev_iter_init(&iter, parent, NULL, NULL);
while ((dev = class_dev_iter_next(&iter)))
class_intf->add_dev(dev, class_intf); //接口方法作用于設(shè)備
class_dev_iter_exit(&iter);
}
mutex_unlock(&parent->p->mutex);
return 0;
}從類中刪除接口,需要調(diào)用class_interface_unregister函數(shù)完成: void class_interface_unregister(struct class_interface *class_intf)
{
struct class *parent = class_intf->class;
struct class_dev_iter iter;
struct device *dev;
if (!parent)
return;
mutex_lock(&parent->p->mutex);
list_del_init(&class_intf->node); //將class_interface從class的接口鏈表中刪除
if (class_intf->remove_dev) { //如果接口設(shè)置了remove_dev方法,對該class的所有device調(diào)用
class_dev_iter_init(&iter, parent, NULL, NULL);
while ((dev = class_dev_iter_next(&iter)))
class_intf->remove_dev(dev, class_intf); //接口方法作用于設(shè)備
class_dev_iter_exit(&iter);
}
mutex_unlock(&parent->p->mutex);
class_put(parent);
}基于設(shè)備驅(qū)動模型實現(xiàn)子系統(tǒng)Linux設(shè)備驅(qū)動模型已經(jīng)將每種對象的關(guān)系,sysfs的呈現(xiàn)方式已經(jīng)實現(xiàn)了。實現(xiàn)子系統(tǒng)只需要定義業(yè)務(wù)自身的總線類型, 設(shè)備, 驅(qū)動, 類, 接口分別”繼承”bus_type, device, driver, class, class_interface。并根據(jù)具體業(yè)務(wù)實現(xiàn)各個結(jié)構(gòu)規(guī)定的回調(diào)函數(shù)。最后調(diào)用上述的注冊函數(shù)添加到系統(tǒng),便完成子系統(tǒng)的開發(fā)。 SCSI子系統(tǒng)之概述Linux SCSI子系統(tǒng)的分層架構(gòu):
Linux SCSI模型Linux SCSI模型是內核的抽象,主機適配器連接主機IO總線(如PCI總線)和存儲IO總線(如SCSI總線)。一臺計算機可以有多個主機適配器,而主機適配器可以控制一條或多條SCSI總線,一條總線可以有多個目標節點與之相連,并且一個目標節點可以有多個邏輯單元。 在Linux SCSI子系統中,內核中的目標節點(target)對應SCSI磁盤,SCSI磁盤中可以有多個邏輯單元,統一由磁盤控制器控制,這些邏輯單元才是真正作為IO終點的存儲設備,內核用設備(device)對邏輯單元進行抽象;內核中的Host對應主機適配器(物理的HBA/RAID卡,虛擬的iscsi target) 內核使用四元組 <host:channel:id:lun> 來唯一標識一個scsi的邏輯單元,在sysfs中查看sda磁盤<2:0:0:0>顯示如下: root@ubuntu16:/home/comet/Costor/bin# ls /sys/bus/scsi/devices/2\:0\:0\:0/block/sda/ alignment_offset device events_poll_msecs integrity removable sda5 subsystem bdi discard_alignment ext_range power ro size trace capability events holders queue sda1 slaves uevent dev events_async inflight range sda2 stat root@ubuntu16:/home/comet/Costor/bin# cat /sys/bus/scsi/devices/2\:0\:0\:0/block/sda/dev 8:0 root@ubuntu16:/home/comet/Costor/bin# ll /dev/sda brw-rw---- 1 root disk 8, 0 Sep 19 11:36 /dev/sda
SCSI命令SCSI 命令是在 Command Descriptor Block (CDB) 中定義的。CDB 包含了用來定義要執(zhí)行的特定操作的操作代碼,以及大量特定于操作的參數(shù)。
借助大約 60 種可用命令,SCSI 可適用于許多設備(包括隨機存取設備,比如磁盤和像磁帶這樣的順序存儲設備)。SCSI 也提供了專門的命令以訪問箱體服務(比如存儲箱體內部當前的傳感和溫度)。 核心數據結構主機適配器模板scsi_host_template主機適配器模板是相同型號主機適配器的公共內容,包括請求隊列深度,SCSI命令處理回調函數,錯誤處理恢復函數。分配主機適配器結構時,需要使用主機適配器模板來賦值。在編寫SCSI低層驅動時,第一步便是定義模板scsi_host_template,之后才能由模板生成主機適配器。 struct scsi_host_template {
struct module *module; //指向使用該模板實現(xiàn)的scsi_host,低層驅(qū)動模塊。
const char *name; //主機適配器名稱
int (* detect)(struct scsi_host_template *);
int (* release)(struct Scsi_Host *);
const char *(* info)(struct Scsi_Host *); //返回HBA相關(guān)信息,可選實現(xiàn)
int (* ioctl)(struct scsi_device *dev, int cmd, void __user *arg); //用戶空間ioctl函數(shù)的實現(xiàn),可選實現(xiàn)
#ifdef CONFIG_COMPAT
//通過該函數(shù),支持32位系統(tǒng)的用戶態(tài)ioctl函數(shù)
int (* compat_ioctl)(struct scsi_device *dev, int cmd, void __user *arg);
#endif
//將scsi命令放進低層驅(qū)動的隊列,由中間層調(diào)用,必須實現(xiàn)
int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *);
//以下5個函數(shù)是錯誤處理回調(diào)函數(shù),由中間層按照嚴重程度調(diào)用
int (* eh_abort_handler)(struct scsi_cmnd *); //Abort
int (* eh_device_reset_handler)(struct scsi_cmnd *); //Device Reset
int (* eh_target_reset_handler)(struct scsi_cmnd *); //Target Reset
int (* eh_bus_reset_handler)(struct scsi_cmnd *); //Bus Reset
int (* eh_host_reset_handler)(struct scsi_cmnd *); //Host Reset
//當掃描到新磁盤時調(diào)用,中間層回調(diào)這個函數(shù)中可以分配和初始化低層驅(qū)動所需要的結(jié)構(gòu)
int (* slave_alloc)(struct scsi_device *);
//在設(shè)備受到INQUIRY命令后,執(zhí)行相關(guān)的配置操作
int (* slave_configure)(struct scsi_device *);
//在scsi設(shè)備銷毀之前調(diào)用,中間層回調(diào)用于釋放slave_alloc分配的私有數(shù)據(jù)
void (* slave_destroy)(struct scsi_device *);
//當發(fā)現(xiàn)新的target,中間層調(diào)用,用戶分配target私有數(shù)據(jù)
int (* target_alloc)(struct scsi_target *);
//在target被銷毀之前,中間層調(diào)用,低層驅(qū)動實現(xiàn),用于釋放target_alloc分配的數(shù)據(jù)
void (* target_destroy)(struct scsi_target *);
//需要自定義掃描target邏輯時,中間層循環(huán)檢查返回值,直到該函數(shù)返回1,表示掃描完成
int (* scan_finished)(struct Scsi_Host *, unsigned long);
//需要自定義掃描target邏輯時,掃描開始前回調(diào)
void (* scan_start)(struct Scsi_Host *);
//改變主機適配器的隊列深度,返回設(shè)置的隊列深度
int (* change_queue_depth)(struct scsi_device *, int);
//返回磁盤的BIOS參數(shù),如size, device, list (heads, sectors, cylinders)
int (* bios_param)(struct scsi_device *, struct block_device *,
sector_t, int []);
void (*unlock_native_capacity)(struct scsi_device *);
//在procfs中的讀寫操作回調(diào)
int (*show_info)(struct seq_file *, struct Scsi_Host *);
int (*write_info)(struct Scsi_Host *, char *, int);
//中間層發(fā)現(xiàn)scsi命令超時回調(diào)
enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *);
//通過sysfs屬性reset主機適配器時,回調(diào)
int (*host_reset)(struct Scsi_Host *shost, int reset_type);
#define SCSI_ADAPTER_RESET 1
#define SCSI_FIRMWARE_RESET 2
const char *proc_name; //在proc文件系統(tǒng)的名稱
struct proc_dir_entry *proc_dir;
int can_queue; //主機適配器能同時接受的命令數(shù)
int this_id;
/*
* This determines the degree to which the host adapter is capable
* of scatter-gather.
*/ //聚散列表的參數(shù)
unsigned short sg_tablesize;
unsigned short sg_prot_tablesize;
/*
* Set this if the host adapter has limitations beside segment count.
*/ //單個scsi命令能夠訪問的扇區(qū)最大數(shù)量
unsigned int max_sectors;
/*
* DMA scatter gather segment boundary limit. A segment crossing this
* boundary will be split in two.
*/
unsigned long dma_boundary; //DMA聚散段邊界值,超過該值將被切割成兩個
#define SCSI_DEFAULT_MAX_SECTORS 1024
short cmd_per_lun;
/*
* present contains counter indicating how many boards of this
* type were found when we did the scan.
*/
unsigned char present;
/* If use block layer to manage tags, this is tag allocation policy */
int tag_alloc_policy;
/*
* Track QUEUE_FULL events and reduce queue depth on demand.
*/
unsigned track_queue_depth:1;
/*
* This specifies the mode that a LLD supports.
*/
unsigned supported_mode:2; //低層驅(qū)動支持的模式(initiator或target)
/*
* True if this host adapter uses unchecked DMA onto an ISA bus.
*/
unsigned unchecked_isa_dma:1;
unsigned use_clustering:1;
/*
* True for emulated SCSI host adapters (e.g. ATAPI).
*/
unsigned emulated:1;
/*
* True if the low-level driver performs its own reset-settle delays.
*/
unsigned skip_settle_delay:1;
/* True if the controller does not support WRITE SAME */
unsigned no_write_same:1;
/*
* True if asynchronous aborts are not supported
*/
unsigned no_async_abort:1;
/*
* Countdown for host blocking with no commands outstanding.
*/
unsigned int max_host_blocked; //主機適配器發(fā)送隊列的低閥值,允許累計多個命令同時派發(fā)
#define SCSI_DEFAULT_HOST_BLOCKED 7
/*
* Pointer to the sysfs class properties for this host, NULL terminated.
*/
struct device_attribute **shost_attrs; //主機適配器類屬性
/*
* Pointer to the SCSI device properties for this host, NULL terminated.
*/
struct device_attribute **sdev_attrs; //主機適配器設(shè)備屬性
struct list_head legacy_hosts;
u64 vendor_id;
/*
* Additional per-command data allocated for the driver.
*/ //scsi 命令緩沖池,scsi命令都是預先分配好的,保存在cmd_pool中
unsigned int cmd_size;
struct scsi_host_cmd_pool *cmd_pool;
/* temporary flag to disable blk-mq I/O path */
bool disable_blk_mq; //禁用通用塊層多隊列模式標志
};主機適配器Scsi_HostScsi_Host描述一個SCSI主機適配器,SCSI主機適配器通常是一塊基于PCI總線的擴展卡或是一個SCSI控制器芯片。每個SCSI主機適配器可以存在多個通道,一個通道實際擴展了一條SCSI總線。每個通道可以連接多個SCSI目標節點,具體連接數量與SCSI總線帶載能力有關,或者受具體SCSI協議的限制。 真實的主機總線適配器是接入主機IO總線上(通常是PCI總線),在系統啟動時,會掃描掛載在PCI總線上的設備,此時會分配主機總線適配器。 struct Scsi_Host {
struct list_head __devices; //設(shè)備鏈表
struct list_head __targets; //目標節(jié)點鏈表
struct scsi_host_cmd_pool *cmd_pool; //scsi命令緩沖池
spinlock_t free_list_lock; //保護free_list
struct list_head free_list; /* backup store of cmd structs, scsi命令預先分配的備用命令鏈表 */
struct list_head starved_list; //scsi命令的饑餓鏈表
spinlock_t default_lock;
spinlock_t *host_lock;
struct mutex scan_mutex;/* serialize scanning activity */
struct list_head eh_cmd_q; //執(zhí)行錯誤的scsi命令的鏈表
struct task_struct * ehandler; /* Error recovery thread. 錯誤恢復線程 */
struct completion * eh_action; /* Wait for specific actions on the
host. */
wait_queue_head_t host_wait; //scsi設(shè)備恢復等待隊列
struct scsi_host_template *hostt; //主機適配器模板
struct scsi_transport_template *transportt; //指向SCSI傳輸層模板
/*
* Area to keep a shared tag map (if needed, will be
* NULL if not).
*/
union {
struct blk_queue_tag *bqt;
struct blk_mq_tag_set tag_set; //SCSI支持多隊列時使用
};
//已經(jīng)派發(fā)給主機適配器(低層驅(qū)動)的scsi命令數(shù)
atomic_t host_busy; /* commands actually active on low-level */
atomic_t host_blocked; //阻塞的scsi命令數(shù)
unsigned int host_failed; /* commands that failed.
protected by host_lock */
unsigned int host_eh_scheduled; /* EH scheduled without command */
unsigned int host_no; /* Used for IOCTL_GET_IDLUN, /proc/scsi et al. 系統(tǒng)內(nèi)唯一標識 */
/* next two fields are used to bound the time spent in error handling */
int eh_deadline;
unsigned long last_reset; //記錄上次reset時間
/*
* These three parameters can be used to allow for wide scsi,
* and for host adapters that support multiple busses
* The last two should be set to 1 more than the actual max id
* or lun (e.g. 8 for SCSI parallel systems).
*/
unsigned int max_channel; //主機適配器的最大通道編號
unsigned int max_id; //主機適配器目標節(jié)點最大編號
u64 max_lun; //主機適配器lun最大編號
unsigned int unique_id;
/*
* The maximum length of SCSI commands that this host can accept.
* Probably 12 for most host adapters, but could be 16 for others.
* or 260 if the driver supports variable length cdbs.
* For drivers that don't set this field, a value of 12 is
* assumed.
*/
unsigned short max_cmd_len; //主機適配器可以接受的最長的SCSI命令
//下面這段在scsi_host_template中也有,由template中的字段賦值
int this_id;
int can_queue;
short cmd_per_lun;
short unsigned int sg_tablesize;
short unsigned int sg_prot_tablesize;
unsigned int max_sectors;
unsigned long dma_boundary;
/*
* In scsi-mq mode, the number of hardware queues supported by the LLD.
*
* Note: it is assumed that each hardware queue has a queue depth of
* can_queue. In other words, the total queue depth per host
* is nr_hw_queues * can_queue.
*/
unsigned nr_hw_queues; //在scsi-mq模式中,低層驅(qū)動所支持的硬件隊列的數(shù)量
/*
* Used to assign serial numbers to the cmds.
* Protected by the host lock.
*/
unsigned long cmd_serial_number; //指向命令序列號
unsigned active_mode:2; //標識是initiator或target
unsigned unchecked_isa_dma:1;
unsigned use_clustering:1;
/*
* Host has requested that no further requests come through for the
* time being.
*/
unsigned host_self_blocked:1; //表示低層驅(qū)動要求阻塞該主機適配器,此時中間層不會繼續(xù)派發(fā)命令到主機適配器隊列中
/*
* Host uses correct SCSI ordering not PC ordering. The bit is
* set for the minority of drivers whose authors actually read
* the spec ;).
*/
unsigned reverse_ordering:1;
/* Task mgmt function in progress */
unsigned tmf_in_progress:1; //任務(wù)管理函數(shù)正在執(zhí)行
/* Asynchronous scan in progress */
unsigned async_scan:1; //異步掃描正在執(zhí)行
/* Don't resume host in EH */
unsigned eh_noresume:1; //在錯誤處理過程不恢復主機適配器
/* The controller does not support WRITE SAME */
unsigned no_write_same:1;
unsigned use_blk_mq:1; //是否使用SCSI多隊列模式
unsigned use_cmd_list:1;
/* Host responded with short (<36 bytes) INQUIRY result */
unsigned short_inquiry:1;
/*
* Optional work queue to be utilized by the transport
*/
char work_q_name[20]; //被scsi傳輸層使用的工作隊列
struct workqueue_struct *work_q;
/*
* Task management function work queue
*/
struct workqueue_struct *tmf_work_q; //任務(wù)管理函數(shù)工作隊列
/* The transport requires the LUN bits NOT to be stored in CDB[1] */
unsigned no_scsi2_lun_in_cdb:1;
/*
* Value host_blocked counts down from
*/
unsigned int max_host_blocked; //在派發(fā)隊列中累計命令達到這個數(shù)值,才開始喚醒主機適配器
/* Protection Information */
unsigned int prot_capabilities;
unsigned char prot_guard_type;
/*
* q used for scsi_tgt msgs, async events or any other requests that
* need to be processed in userspace
*/
struct request_queue *uspace_req_q; //需要在用戶空間處理的scsi_tgt消息、異步事件或其他請求的請求隊列
/* legacy crap */
unsigned long base;
unsigned long io_port; //I/O端口編號
unsigned char n_io_port;
unsigned char dma_channel;
unsigned int irq;
enum scsi_host_state shost_state; //狀態(tài)
/* ldm bits */ //shost_gendev: 內(nèi)嵌通用設(shè)備,SCSI設(shè)備通過這個域鏈入SCSI總線類型(scsi_bus_type)的設(shè)備鏈表
struct device shost_gendev, shost_dev;
//shost_dev: 內(nèi)嵌類設(shè)備, SCSI設(shè)備通過這個域鏈入SCSI主機適配器類型(shost_class)的設(shè)備鏈表
/*
* List of hosts per template.
*
* This is only for use by scsi_module.c for legacy templates.
* For these access to it is synchronized implicitly by
* module_init/module_exit.
*/
struct list_head sht_legacy_list;
/*
* Points to the transport data (if any) which is allocated
* separately
*/
void *shost_data; //指向獨立分配的傳輸層數(shù)據(jù),由SCSI傳輸層使用
/*
* Points to the physical bus device we'd use to do DMA
* Needed just in case we have virtual hosts.
*/
struct device *dma_dev;
/*
* We should ensure that this is aligned, both for better performance
* and also because some compilers (m68k) don't automatically force
* alignment to a long boundary.
*/ //主機適配器專有數(shù)據(jù)
unsigned long hostdata[0] /* Used for storage of host specific stuff */
__attribute__ ((aligned (sizeof(unsigned long))));
};目標節(jié)點scsi_targetscsi_target結(jié)構(gòu)中有一個內(nèi)嵌驅(qū)動模型設(shè)備,被鏈入SCSI總線類型scsi_bus_type的設(shè)備鏈表。 struct scsi_target {
struct scsi_device *starget_sdev_user; //指向正在進行I/O的scsi設(shè)備,沒有IO則指向NULL
struct list_head siblings; //鏈入主機適配器target鏈表中
struct list_head devices; //屬于該target的device鏈表
struct device dev; //通用設(shè)備,用于加入設(shè)備驅(qū)動模型
struct kref reap_ref; /* last put renders target invisible 本結(jié)構(gòu)的引用計數(shù) */
unsigned int channel; //該target所在的channel號
unsigned int id; /* target id ... replace
* scsi_device.id eventually */
unsigned int create:1; /* signal that it needs to be added */
unsigned int single_lun:1; /* Indicates we should only
* allow I/O to one of the luns
* for the device at a time. */
unsigned int pdt_1f_for_no_lun:1; /* PDT = 0x1f
* means no lun present. */
unsigned int no_report_luns:1; /* Don't use
* REPORT LUNS for scanning. */
unsigned int expecting_lun_change:1; /* A device has reported
* a 3F/0E UA, other devices on
* the same target will also. */
/* commands actually active on LLD. */
atomic_t target_busy;
atomic_t target_blocked; //當前阻塞的命令數(shù)
/*
* LLDs should set this in the slave_alloc host template callout.
* If set to zero then there is not limit.
*/
unsigned int can_queue; //同時處理的命令數(shù)
unsigned int max_target_blocked; //阻塞命令數(shù)閥值
#define SCSI_DEFAULT_TARGET_BLOCKED 3
char scsi_level; //支持的SCSI規(guī)范級別
enum scsi_target_state state; //target狀態(tài)
void *hostdata; /* available to low-level driver */
unsigned long starget_data[0]; /* for the transport SCSI傳輸層(中間層)使用 */
/* starget_data must be the last element!!!! */
} __attribute__((aligned(sizeof(unsigned long))));邏輯設(shè)備scsi_devicescsi_device描述scsi邏輯設(shè)備,代表scsi磁盤的邏輯單元lun。scsi_device描述符所代表的設(shè)備可能是另一臺存儲設(shè)備上的SATA/SAS/SCSI磁盤或SSD。操作系統(tǒng)在掃描到連接在主機適配器上的邏輯設(shè)備時,創(chuàng)建scsi_device結(jié)構(gòu),用于scsi高層驅(qū)動和該設(shè)備通信。 struct scsi_device {
struct Scsi_Host *host; //所歸屬的主機總線適配器
struct request_queue *request_queue; //請求隊列
/* the next two are protected by the host->host_lock */
struct list_head siblings; /* list of all devices on this host */ //鏈入主機總線適配器設(shè)備鏈表
struct list_head same_target_siblings; /* just the devices sharing same target id */ //鏈入target的設(shè)備鏈表
atomic_t device_busy; /* commands actually active on LLDD */
atomic_t device_blocked; /* Device returned QUEUE_FULL. */
spinlock_t list_lock;
struct list_head cmd_list; /* queue of in use SCSI Command structures */
struct list_head starved_entry; //鏈入主機適配器的"饑餓"鏈表
struct scsi_cmnd *current_cmnd; /* currently active command */ //當前正在執(zhí)行的命令
unsigned short queue_depth; /* How deep of a queue we want */
unsigned short max_queue_depth; /* max queue depth */
unsigned short last_queue_full_depth; /* These two are used by */
unsigned short last_queue_full_count; /* scsi_track_queue_full() */
unsigned long last_queue_full_time; /* last queue full time */
unsigned long queue_ramp_up_period; /* ramp up period in jiffies */
#define SCSI_DEFAULT_RAMP_UP_PERIOD (120 * HZ)
unsigned long last_queue_ramp_up; /* last queue ramp up time */
unsigned int id, channel; //scsi_device所屬的target id和所在channel通道號
u64 lun; //該設(shè)備的lun編號
unsigned int manufacturer; /* Manufacturer of device, for using 制造商
* vendor-specific cmd's */
unsigned sector_size; /* size in bytes 硬件的扇區(qū)大小 */
void *hostdata; /* available to low-level driver 專有數(shù)據(jù) */
char type; //SCSI設(shè)備類型
char scsi_level; //所支持SCSI規(guī)范的版本號,由INQUIRY命令獲得
char inq_periph_qual; /* PQ from INQUIRY data */
unsigned char inquiry_len; /* valid bytes in 'inquiry' */
unsigned char * inquiry; /* INQUIRY response data */
const char * vendor; /* [back_compat] point into 'inquiry' ... */
const char * model; /* ... after scan; point to static string */
const char * rev; /* ... "nullnullnullnull" before scan */
#define SCSI_VPD_PG_LEN 255
int vpd_pg83_len; //INQUIRY命令獲取的VPD頁0x83的長度
unsigned char *vpd_pg83;
int vpd_pg80_len; //INQUIRY命令獲取的VPD頁0x80的長度
unsigned char *vpd_pg80;
unsigned char current_tag; /* current tag */
struct scsi_target *sdev_target; /* used only for single_lun */
unsigned int sdev_bflags; /* black/white flags as also found in
* scsi_devinfo.[hc]. For now used only to
* pass settings from slave_alloc to scsi
* core. */
unsigned int eh_timeout; /* Error handling timeout */
unsigned removable:1;
unsigned changed:1; /* Data invalid due to media change */
unsigned busy:1; /* Used to prevent races */
unsigned lockable:1; /* Able to prevent media removal */
unsigned locked:1; /* Media removal disabled */
unsigned borken:1; /* Tell the Seagate driver to be
* painfully slow on this device */
unsigned disconnect:1; /* can disconnect */
unsigned soft_reset:1; /* Uses soft reset option */
unsigned sdtr:1; /* Device supports SDTR messages 支持同步數(shù)據(jù)傳輸 */
unsigned wdtr:1; /* Device supports WDTR messages 支持16位寬數(shù)據(jù)傳輸*/
unsigned ppr:1; /* Device supports PPR messages 支持PPR(并行協(xié)議請求)消息*/
unsigned tagged_supported:1; /* Supports SCSI-II tagged queuing */
unsigned simple_tags:1; /* simple queue tag messages are enabled */
unsigned was_reset:1; /* There was a bus reset on the bus for
* this device */
unsigned expecting_cc_ua:1; /* Expecting a CHECK_CONDITION/UNIT_ATTN
* because we did a bus reset. */
unsigned use_10_for_rw:1; /* first try 10-byte read / write */
unsigned use_10_for_ms:1; /* first try 10-byte mode sense/select */
unsigned no_report_opcodes:1; /* no REPORT SUPPORTED OPERATION CODES */
unsigned no_write_same:1; /* no WRITE SAME command */
unsigned use_16_for_rw:1; /* Use read/write(16) over read/write(10) */
unsigned skip_ms_page_8:1; /* do not use MODE SENSE page 0x08 */
unsigned skip_ms_page_3f:1; /* do not use MODE SENSE page 0x3f */
unsigned skip_vpd_pages:1; /* do not read VPD pages */
unsigned try_vpd_pages:1; /* attempt to read VPD pages */
unsigned use_192_bytes_for_3f:1; /* ask for 192 bytes from page 0x3f */
unsigned no_start_on_add:1; /* do not issue start on add */
unsigned allow_restart:1; /* issue START_UNIT in error handler */
unsigned manage_start_stop:1; /* Let HLD (sd) manage start/stop */
unsigned start_stop_pwr_cond:1; /* Set power cond. in START_STOP_UNIT */
unsigned no_uld_attach:1; /* disable connecting to upper level drivers */
unsigned select_no_atn:1;
unsigned fix_capacity:1; /* READ_CAPACITY is too high by 1 */
unsigned guess_capacity:1; /* READ_CAPACITY might be too high by 1 */
unsigned retry_hwerror:1; /* Retry HARDWARE_ERROR */
unsigned last_sector_bug:1; /* do not use multisector accesses on
SD_LAST_BUGGY_SECTORS */
unsigned no_read_disc_info:1; /* Avoid READ_DISC_INFO cmds */
unsigned no_read_capacity_16:1; /* Avoid READ_CAPACITY_16 cmds */
unsigned try_rc_10_first:1; /* Try READ_CAPACACITY_10 first */
unsigned is_visible:1; /* is the device visible in sysfs */
unsigned wce_default_on:1; /* Cache is ON by default */
unsigned no_dif:1; /* T10 PI (DIF) should be disabled */
unsigned broken_fua:1; /* Don't set FUA bit */
unsigned lun_in_cdb:1; /* Store LUN bits in CDB[1] */
atomic_t disk_events_disable_depth; /* disable depth for disk events */
DECLARE_BITMAP(supported_events, SDEV_EVT_MAXBITS); /* supported events */
DECLARE_BITMAP(pending_events, SDEV_EVT_MAXBITS); /* pending events */
struct list_head event_list; /* asserted events */
struct work_struct event_work;
unsigned int max_device_blocked; /* what device_blocked counts down from */
#define SCSI_DEFAULT_DEVICE_BLOCKED 3
atomic_t iorequest_cnt;
atomic_t iodone_cnt;
atomic_t ioerr_cnt;
struct device sdev_gendev, //內(nèi)嵌通用設(shè)備, 鏈入scsi總線類型(scsi_bus_type)的設(shè)備鏈表
sdev_dev; //內(nèi)嵌類設(shè)備,鏈入scsi設(shè)備類(sdev_class)的設(shè)備鏈表
struct execute_work ew; /* used to get process context on put */
struct work_struct requeue_work;
struct scsi_device_handler *handler; //自定義設(shè)備處理函數(shù)
void *handler_data;
enum scsi_device_state sdev_state; //scsi設(shè)備狀態(tài)
unsigned long sdev_data[0]; //scsi傳輸層使用
} __attribute__((aligned(sizeof(unsigned long))));內核定義的SCSI命令結構scsi_cmndscsi_cmnd結構由SCSI中間層創建,傳遞到SCSI低層驅動。每個IO請求會被創建一個scsi_cmnd,但scsi_cmnd并不一定是IO請求。scsi_cmnd最終轉化成一個具體的SCSI命令。除了命令描述塊之外,scsi_cmnd包含更豐富的信息,包括數據緩沖區、感測數據緩沖區、完成回調函數以及所關聯的塊設備驅動層請求等,是SCSI中間層執行SCSI命令的上下文。 struct scsi_cmnd {
struct scsi_device *device; //指向命令所屬SCSI設(shè)備的描述符的指針
struct list_head list; /* scsi_cmnd participates in queue lists 鏈入scsi設(shè)備的命令鏈表 */
struct list_head eh_entry; /* entry for the host eh_cmd_q */
struct delayed_work abort_work;
int eh_eflags; /* Used by error handlr */
/*
* A SCSI Command is assigned a nonzero serial_number before passed
* to the driver's queue command function. The serial_number is
* cleared when scsi_done is entered indicating that the command
* has been completed. It is a bug for LLDDs to use this number
* for purposes other than printk (and even that is only useful
* for debugging).
*/
unsigned long serial_number; //scsi命令的唯一序號
/*
* This is set to jiffies as it was when the command was first
* allocated. It is used to time how long the command has
* been outstanding
*/
unsigned long jiffies_at_alloc; //分配時的jiffies, 用于計算命令處理時間
int retries; //命令重試次數(shù)
int allowed; //允許的重試次數(shù)
unsigned char prot_op; //保護操作(DIF和DIX)
unsigned char prot_type; //DIF保護類型
unsigned char prot_flags;
unsigned short cmd_len; //命令長度
enum dma_data_direction sc_data_direction; //命令傳輸方向
/* These elements define the operation we are about to perform */
unsigned char *cmnd; //scsi規(guī)范格式的命令字符串
/* These elements define the operation we ultimately want to perform */
struct scsi_data_buffer sdb; //scsi命令數(shù)據(jù)緩沖區(qū)
struct scsi_data_buffer *prot_sdb; //scsi命令保護信息緩沖區(qū)
unsigned underflow; /* Return error if less than
this amount is transferred */
unsigned transfersize; /* How much we are guaranteed to //傳輸單位
transfer with each SCSI transfer
(ie, between disconnect /
reconnects. Probably == sector
size */
struct request *request; /* The command we are 通用塊層的請求描述符
working on */
#define SCSI_SENSE_BUFFERSIZE 96
unsigned char *sense_buffer; //scsi命令感測數(shù)據(jù)緩沖區(qū)
/* obtained by REQUEST SENSE when
* CHECK CONDITION is received on original
* command (auto-sense) */
/* Low-level done function - can be used by low-level driver to point
* to completion function. Not used by mid/upper level code. */
void (*scsi_done) (struct scsi_cmnd *); //scsi命令在低層驅(qū)動完成時,回調(diào)
/*
* The following fields can be written to by the host specific code.
* Everything else should be left alone.
*/
struct scsi_pointer SCp; /* Scratchpad used by some host adapters */
unsigned char *host_scribble; /* The host adapter is allowed to
* call scsi_malloc and get some memory
* and hang it here. The host adapter
* is also expected to call scsi_free
* to release this memory. (The memory
* obtained by scsi_malloc is guaranteed
* to be at an address < 16Mb). */
int result; /* Status code from lower level driver */
int flags; /* Command flags */
unsigned char tag; /* SCSI-II queued command tag */
};
驅動scsi_driver
struct scsi_driver {
struct device_driver gendrv; // "inherits" from device_driver by embedding it as a member
void (*rescan)(struct device *); // callback invoked before a rescan
int (*init_command)(struct scsi_cmnd *); // NOTE(review): presumably a per-command setup hook -- confirm against users
void (*uninit_command)(struct scsi_cmnd *); // NOTE(review): presumably the counterpart of init_command -- confirm
int (*done)(struct scsi_cmnd *); // called when the low-level driver completes a SCSI command; used to compute the number of bytes completed
int (*eh_action)(struct scsi_cmnd *, int); // error-handling callback
};
設備模型
/*
 * Bus type describing the SCSI bus.  It supplies the match and uevent
 * callbacks and, when CONFIG_PM is enabled, the power-management
 * operations.  Exported (GPL-only) for use outside this file.
 */
struct bus_type scsi_bus_type = {
.name = "scsi", // corresponds to /sys/bus/scsi
.match = scsi_bus_match,
.uevent = scsi_bus_uevent,
#ifdef CONFIG_PM
.pm = &scsi_bus_pm_ops,
#endif
};
EXPORT_SYMBOL_GPL(scsi_bus_type);
/* Device class that SCSI host adapters belong to. */
static struct class shost_class = {
.name = "scsi_host", // corresponds to /sys/class/scsi_host
.dev_release = scsi_host_cls_release,
};
初始化過程
操作系統啟動時,會加載scsi子系統,入口函數是init_scsi,使用subsys_initcall定義:
static int __init init_scsi(void)
{
int error;
error = scsi_init_queue(); //初始化聚散列表所需要的存儲池
if (error)
return error;
error = scsi_init_procfs(); //初始化procfs中與scsi相關(guān)的目錄項
if (error)
goto cleanup_queue;
error = scsi_init_devinfo();//設(shè)置scsi動態(tài)設(shè)備信息列表
if (error)
goto cleanup_procfs;
error = scsi_init_hosts(); //注冊shost_class類,在/sys/class/目錄下創(chuàng)建scsi_host子目錄
if (error)
goto cleanup_devlist;
error = scsi_init_sysctl(); //注冊SCSI系統(tǒng)控制表
if (error)
goto cleanup_hosts;
error = scsi_sysfs_register(); //注冊scsi_bus_type總線類型和sdev_class類
if (error)
goto cleanup_sysctl;
scsi_netlink_init(); //初始化SCSI傳輸netlink接口
printk(KERN_NOTICE "SCSI subsystem initialized\n");
return 0;
cleanup_sysctl:
scsi_exit_sysctl();
cleanup_hosts:
scsi_exit_hosts();
cleanup_devlist:
scsi_exit_devinfo();
cleanup_procfs:
scsi_exit_procfs();
cleanup_queue:
scsi_exit_queue();
printk(KERN_ERR "SCSI subsystem failed to initialize, error = %d\n",
-error);
return error;
}
scsi_init_hosts函數初始化scsi子系統主機適配器所屬的類shost_class:
int scsi_init_hosts(void)
{
return class_register(&shost_class);
}
scsi_sysfs_register函數初始化scsi子系統總線類型scsi_bus_type和設備所屬的類sdev_class:
int scsi_sysfs_register(void)
{
int error;
error = bus_register(&scsi_bus_type);
if (!error) {
error = class_register(&sdev_class);
if (error)
bus_unregister(&scsi_bus_type);
}
return error;
}
scsi低層驅動是面向主機適配器的,低層驅動被加載時,需要添加主機適配器。主機適配器添加有兩種方式:1.在PCI子系統掃描掛載驅動時添加;2.手動方式添加。所有基于硬件PCI接口的主機適配器都采用第一種方式。添加主機適配器包括兩個步驟:先用scsi_host_alloc分配并初始化Scsi_Host,再用scsi_add_host將其添加到系統:
struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
{
/*
 * Allocate a Scsi_Host together with privsize bytes of private data and
 * initialise it from the host template sht.  The function also starts the
 * per-host error-handler thread and allocates the task-management
 * workqueue.
 *
 * Returns the new host, or NULL when the memory allocation, the thread
 * creation or the workqueue allocation fails.
 *
 * NOTE(review): both failure labels free the host with a plain kfree()
 * after device_initialize()/dev_set_name() have already run on the two
 * embedded devices; whether anything set up by those calls needs a
 * separate release is not visible here -- confirm.
 */
struct Scsi_Host *shost;
gfp_t gfp_mask = GFP_KERNEL;
if (sht->unchecked_isa_dma && privsize)
gfp_mask |= __GFP_DMA;
//allocate the Scsi_Host and its private data area in a single allocation
shost = kzalloc(sizeof(struct Scsi_Host) + privsize, gfp_mask);
if (!shost)
return NULL;
shost->host_lock = &shost->default_lock;
spin_lock_init(shost->host_lock);
shost->shost_state = SHOST_CREATED; //set the initial state
INIT_LIST_HEAD(&shost->__devices); //initialise the list of SCSI devices
INIT_LIST_HEAD(&shost->__targets); //initialise the list of targets
INIT_LIST_HEAD(&shost->eh_cmd_q); //initialise the list of SCSI commands that failed
INIT_LIST_HEAD(&shost->starved_list); //initialise the starved list
init_waitqueue_head(&shost->host_wait);
mutex_init(&shost->scan_mutex);
/*
* subtract one because we increment first then return, but we need to
* know what the next host number was before increment
*/ //host numbers are handed out by incrementing a counter
shost->host_no = atomic_inc_return(&scsi_host_next_hn) - 1;
shost->dma_channel = 0xff;
/* These three are default values which can be overridden */
shost->max_channel = 0; //default maximum channel number is 0
shost->max_id = 8; //default maximum number of targets
shost->max_lun = 8; //default maximum number of scsi_devices (LUNs)
/* Give each shost a default transportt */
shost->transportt = &blank_transport_template; //SCSI transport-layer template
/*
* All drivers right now should be able to handle 12 byte
* commands. Every so often there are requests for 16 byte
* commands, but individual low-level drivers need to certify that
* they actually do something sensible with such commands.
*/
shost->max_cmd_len = 12; //longest SCSI command length
shost->hostt = sht; //remember the host template in use
/* Copy the per-host limits and flags from the template. */
shost->this_id = sht->this_id;
shost->can_queue = sht->can_queue;
shost->sg_tablesize = sht->sg_tablesize;
shost->sg_prot_tablesize = sht->sg_prot_tablesize;
shost->cmd_per_lun = sht->cmd_per_lun;
shost->unchecked_isa_dma = sht->unchecked_isa_dma;
shost->use_clustering = sht->use_clustering;
shost->no_write_same = sht->no_write_same;
/*
 * eh_deadline stays at -1 when shost_eh_deadline is -1 or the template
 * has no host reset handler; otherwise it is shost_eh_deadline * HZ,
 * capped at INT_MAX.
 */
if (shost_eh_deadline == -1 || !sht->eh_host_reset_handler)
shost->eh_deadline = -1;
else if ((ulong) shost_eh_deadline * HZ > INT_MAX) {
shost_printk(KERN_WARNING, shost,
"eh_deadline %u too large, setting to %u\n",
shost_eh_deadline, INT_MAX / HZ);
shost->eh_deadline = INT_MAX;
} else
shost->eh_deadline = shost_eh_deadline * HZ;
if (sht->supported_mode == MODE_UNKNOWN) //the HBA mode comes from the template
/* means we didn't set it ... default to INITIATOR */
shost->active_mode = MODE_INITIATOR; //the host adapter mode defaults to initiator
else
shost->active_mode = sht->supported_mode;
if (sht->max_host_blocked)
shost->max_host_blocked = sht->max_host_blocked;
else
shost->max_host_blocked = SCSI_DEFAULT_HOST_BLOCKED;
/*
* If the driver imposes no hard sector transfer limit, start at
* machine infinity initially.
*/
if (sht->max_sectors)
shost->max_sectors = sht->max_sectors;
else
shost->max_sectors = SCSI_DEFAULT_MAX_SECTORS;
/*
* assume a 4GB boundary, if not set
*/
if (sht->dma_boundary)
shost->dma_boundary = sht->dma_boundary;
else
shost->dma_boundary = 0xffffffff; //default DMA boundary is 4G
shost->use_blk_mq = scsi_use_blk_mq && !shost->hostt->disable_blk_mq;
device_initialize(&shost->shost_gendev); //initialise the host's embedded generic device
dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
shost->shost_gendev.bus = &scsi_bus_type; //set the bus type of the host adapter
shost->shost_gendev.type = &scsi_host_type; //set the device type of the host adapter
device_initialize(&shost->shost_dev); //initialise the host's embedded class device
shost->shost_dev.parent = &shost->shost_gendev; //the class device's parent is the embedded generic device
shost->shost_dev.class = &shost_class; //the class device belongs to shost_class
dev_set_name(&shost->shost_dev, "host%d", shost->host_no);
shost->shost_dev.groups = scsi_sysfs_shost_attr_groups; //attribute groups of the class device
shost->ehandler = kthread_run(scsi_error_handler, shost, //start the host's error-recovery kernel thread
"scsi_eh_%d", shost->host_no);
if (IS_ERR(shost->ehandler)) {
shost_printk(KERN_WARNING, shost,
"error handler thread failed to spawn, error = %ld\n",
PTR_ERR(shost->ehandler));
goto fail_kfree;
}
//allocate the task-management workqueue
shost->tmf_work_q = alloc_workqueue("scsi_tmf_%d",
WQ_UNBOUND | WQ_MEM_RECLAIM,
1, shost->host_no);
if (!shost->tmf_work_q) {
shost_printk(KERN_WARNING, shost,
"failed to create tmf workq\n");
goto fail_kthread;
}
scsi_proc_hostdir_add(shost->hostt); //add the host adapter's directory in procfs, e.g. /proc/scsi/<host adapter name>
return shost;
/* Failure unwinding: stop the thread if it was started, then free the host. */
fail_kthread:
kthread_stop(shost->ehandler);
fail_kfree:
kfree(shost);
return NULL;
}
EXPORT_SYMBOL(scsi_host_alloc);

/*
 * scsi_add_host - add a host adapter, using one device for both roles
 * @host: host adapter to add
 * @dev:  parent device; it is also passed on as the DMA device
 *
 * Returns the value produced by scsi_add_host_with_dma().
 */
static inline int __must_check scsi_add_host(struct Scsi_Host *host,
					     struct device *dev)
{
	int ret = scsi_add_host_with_dma(host, dev, dev);

	return ret;
}
/*
 * scsi_add_host_with_dma - register an allocated host adapter
 * @shost:   host adapter to add
 * @dev:     parent device; &platform_bus is used when this is NULL and no
 *           parent has been set yet
 * @dma_dev: device recorded in shost->dma_dev; the parent of the generic
 *           device is used when this is NULL
 *
 * Sets up tags and the command free list, adds the generic device and the
 * class device, allocates the transport data area and workqueue when the
 * transport template asks for them, and finally adds the sysfs and procfs
 * entries.
 *
 * Returns 0 on success or a negative error code; on failure the steps
 * listed under the unwind labels at the end are undone.
 *
 * NOTE(review): the unwind labels contain no put_device() for the two
 * get_device() calls below and no counterpart to pm_runtime_enable();
 * confirm whether those are released elsewhere.
 */
int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
struct device *dma_dev)
{
struct scsi_host_template *sht = shost->hostt;
int error = -EINVAL;
shost_printk(KERN_INFO, shost, "%s\n",
sht->info ? sht->info(shost) : sht->name);
if (!shost->can_queue) {
shost_printk(KERN_ERR, shost,
"can_queue = 0 no longer supported\n");
goto fail;
}
if (shost_use_blk_mq(shost)) { //if the host adapter uses multi-queue I/O, set up
error = scsi_mq_setup_tags(shost); //the corresponding multi-queue tags
if (error)
goto fail;
} else {
shost->bqt = blk_init_tags(shost->can_queue,
shost->hostt->tag_alloc_policy);
if (!shost->bqt) {
error = -ENOMEM;
goto fail;
}
}
/*
* Note that we allocate the freelist even for the MQ case for now,
* as we need a command set aside for scsi_reset_provider. Having
* the full host freelist and one command available for that is a
* little heavy-handed, but avoids introducing a special allocator
* just for this. Eventually the structure of scsi_reset_provider
* will need a major overhaul.
*/ //allocate the buffers holding SCSI commands and sense data, plus the reserve command free list
error = scsi_setup_command_freelist(shost);
if (error)
goto out_destroy_tags;
//set the host adapter's parent device, which fixes its position in sysfs; callers usually pass a pci_dev through dev
if (!shost->shost_gendev.parent)
shost->shost_gendev.parent = dev ? dev : &platform_bus; //fall back to platform_bus when dev is NULL
if (!dma_dev)
dma_dev = shost->shost_gendev.parent;
shost->dma_dev = dma_dev;
error = device_add(&shost->shost_gendev); //add the host adapter's generic device to the system
if (error)
goto out_destroy_freelist;
pm_runtime_set_active(&shost->shost_gendev);
pm_runtime_enable(&shost->shost_gendev);
device_enable_async_suspend(&shost->shost_gendev); //allow asynchronous suspend of the generic device
scsi_host_set_state(shost, SHOST_RUNNING); //update the host adapter state
get_device(shost->shost_gendev.parent); //take a reference on the generic device's parent
device_enable_async_suspend(&shost->shost_dev); //allow asynchronous suspend of the class device
error = device_add(&shost->shost_dev); //add the host adapter's class device to the system
if (error)
goto out_del_gendev;
get_device(&shost->shost_gendev);
if (shost->transportt->host_size) { //data area used by the SCSI transport layer
shost->shost_data = kzalloc(shost->transportt->host_size,
GFP_KERNEL);
if (shost->shost_data == NULL) {
error = -ENOMEM;
goto out_del_dev;
}
}
if (shost->transportt->create_work_queue) {
snprintf(shost->work_q_name, sizeof(shost->work_q_name),
"scsi_wq_%d", shost->host_no);
shost->work_q = create_singlethread_workqueue( //workqueue used by the SCSI transport layer
shost->work_q_name);
if (!shost->work_q) {
error = -EINVAL;
goto out_free_shost_data;
}
}
error = scsi_sysfs_add_host(shost); //add the host adapter to the subsystem
if (error)
goto out_destroy_host;
scsi_proc_host_add(shost); //add the host adapter's information to procfs
return error;
/* Unwind ladder: each label undoes the step set up just before it. */
out_destroy_host:
if (shost->work_q)
destroy_workqueue(shost->work_q);
out_free_shost_data:
kfree(shost->shost_data);
out_del_dev:
device_del(&shost->shost_dev);
out_del_gendev:
device_del(&shost->shost_gendev);
out_destroy_freelist:
scsi_destroy_command_freelist(shost);
out_destroy_tags:
/* Only the multi-queue tags are destroyed here; shost->bqt is left alone. */
if (shost_use_blk_mq(shost))
scsi_mq_destroy_tags(shost);
fail:
return error;
}
EXPORT_SYMBOL(scsi_add_host_with_dma);
設備探測過程
在系統啟動過程中,會掃描默認的PCI根總線,從而觸發了PCI設備掃描的過程,開始構造PCI設備樹,SCSI主機適配器是掛載在PCI總線的設備。SCSI主機適配器作為PCI設備會被PCI總線驅動層掃描到(PCI設備的掃描采用配置空間訪問的方式),掃描到SCSI主機適配器后,操作系統開始加載SCSI主機適配器驅動,SCSI主機適配器驅動就是上面所說的低層驅動。SCSI主機適配器驅動根據SCSI主機適配器模板分配SCSI主機適配器描述符,并添加到系統,之后啟動通過SCSI主機適配器擴展出來的下一級總線–SCSI總線的掃描過程。
SCSI中間層依次以可能的ID和LUN構造INQUIRY命令,之后將這些INQUIRY命令提交給塊IO子系統,后者又最終將調用SCSI中間層的策略例程,再次提取到SCSI命令結構后,調用SCSI低層驅動的queuecommand回調函數實現。
SCSI總線具體的掃描方式可以由具體的主機適配器固件、主機適配器驅動實現,在此只討論由主機適配器驅動調用scsi中間層提供通用的掃描函數的實現方式scsi_scan_host。
void scsi_scan_host(struct Scsi_Host *shost)
{
	/*
	 * Scan a host adapter.  Nothing is done when scsi_scan_type starts
	 * with "none" or when scsi_autopm_get_host() reports failure.  If an
	 * asynchronous scan can be prepared the work is queued; otherwise the
	 * scan is carried out synchronously right here.
	 */
	struct async_scan_data *async_data;

	if (!strncmp(scsi_scan_type, "none", 4) ||
	    scsi_autopm_get_host(shost) < 0)
		return;

	async_data = scsi_prep_async_scan(shost);
	if (async_data) {
		/*
		 * Register with the async subsystem so that
		 * wait_for_device_probe() will flush this work.  The matching
		 * scsi_autopm_put_host(shost) is called in
		 * scsi_finish_async_scan().
		 */
		async_schedule(do_scan_async, async_data);
	} else {
		/* No async scan data: scan synchronously, then drop the PM hold. */
		do_scsi_scan_host(shost);
		scsi_autopm_put_host(shost);
	}
}
EXPORT_SYMBOL(scsi_scan_host);
scsi_scan_host函數是scsi中間層提供的主機適配器掃描函數,對于有自定義掃描邏輯需求的主機適配器驅動,可以設置主機適配器模板的回調函數,由scsi_scan_host函數來調用回調實現自定義掃描。
static void do_scsi_scan_host(struct Scsi_Host *shost)
{
if (shost->hostt->scan_finished) { //使用自定義掃描方式
unsigned long start = jiffies;
if (shost->hostt->scan_start)
shost->hostt->scan_start(shost); //自定義掃描開始回調(diào)
while (!shost->hostt->scan_finished(shost, jiffies - start)) //自定義掃描完成時返回1
msleep(10);
} else { //scsi子系統(tǒng)通用掃描函數(shù), SCAN_WILD_CARD表示掃描所有的target和device
scsi_scan_host_selected(shost, SCAN_WILD_CARD, SCAN_WILD_CARD,
SCAN_WILD_CARD, 0);
}
}
如果主機適配器模板設置了自定義掃描函數,do_scsi_scan_host函數將會調用它們。如果沒有設置則使用默認的掃描函數scsi_scan_host_selected執行掃描。
int scsi_scan_host_selected(struct Scsi_Host *shost, unsigned int channel,
unsigned int id, u64 lun, int rescan)
{
SCSI_LOG_SCAN_BUS(3, shost_printk (KERN_INFO, shost,
"%s: <%u:%u:%llu>\n",
__func__, channel, id, lun));
//檢查channel、id、lun是否有效
if (((channel != SCAN_WILD_CARD) && (channel > shost->max_channel)) ||
((id != SCAN_WILD_CARD) && (id >= shost->max_id)) ||
((lun != SCAN_WILD_CARD) && (lun >= shost->max_lun)))
return -EINVAL;
mutex_lock(&shost->scan_mutex);
if (!shost->async_scan)
scsi_complete_async_scans();
//檢查Scsi_Host的狀態(tài)是否允許掃描
if (scsi_host_scan_allowed(shost) && scsi_autopm_get_host(shost) == 0) {
if (channel == SCAN_WILD_CARD)
for (channel = 0; channel <= shost->max_channel; //遍歷所有的channel進行掃描
channel++)
scsi_scan_channel(shost, channel, id, lun, //掃描channel
rescan);
else
scsi_scan_channel(shost, channel, id, lun, rescan); //掃描指定的channel
scsi_autopm_put_host(shost);
}
mutex_unlock(&shost->scan_mutex);
return 0;
}
scsi_scan_host_selected函數掃描指定的主機適配器,根據輸入的參數決定是否遍歷掃描所有channel或掃描指定channel,通過函數scsi_scan_channel完成。
static void scsi_scan_channel(struct Scsi_Host *shost, unsigned int channel,
unsigned int id, u64 lun, int rescan)
{
uint order_id;
if (id == SCAN_WILD_CARD)
for (id = 0; id < shost->max_id; ++id) { //遍歷所有的target
/*
* XXX adapter drivers when possible (FCP, iSCSI)
* could modify max_id to match the current max,
* not the absolute max.
*
* XXX add a shost id iterator, so for example,
* the FC ID can be the same as a target id
* without a huge overhead of sparse id's.
*/
if (shost->reverse_ordering)
/*
* Scan from high to low id.
*/
order_id = shost->max_id - id - 1;
else
order_id = id;
__scsi_scan_target(&shost->shost_gendev, channel, //掃描指定的target
order_id, lun, rescan);
}
else
__scsi_scan_target(&shost->shost_gendev, channel,
id, lun, rescan);
}
__scsi_scan_target函數掃描指定target內部的lun。
static void __scsi_scan_target(struct device *parent, unsigned int channel,
unsigned int id, u64 lun, int rescan)
{
/*
 * Scan one target.  When lun is a specific value only that LUN is probed.
 * When lun is SCAN_WILD_CARD, LUN 0 is probed first and, if something
 * answers, scsi_report_lun_scan() is tried with a sequential LUN scan as
 * fallback.  The host's own id (shost->this_id) is never scanned.
 */
struct Scsi_Host *shost = dev_to_shost(parent);
int bflags = 0;
int res;
struct scsi_target *starget;
if (shost->this_id == id)
/*
* Don't scan the host adapter
*/
return;
//allocate and initialise the target structure for the given id
starget = scsi_alloc_target(parent, channel, id);
if (!starget)
return;
scsi_autopm_get_target(starget);
if (lun != SCAN_WILD_CARD) {
/*
* Scan for a specific host/chan/id/lun.
*/ //probe the scsi_device (LUN) with the given number on this target and add it to the subsystem
scsi_probe_and_add_lun(starget, lun, NULL, NULL, rescan, NULL);
goto out_reap;
}
/*
* Scan LUN 0, if there is some response, scan further. Ideally, we
* would not configure LUN 0 until all LUNs are scanned.
*/ //probe LUN 0 of the target
res = scsi_probe_and_add_lun(starget, 0, &bflags, NULL, rescan, NULL);
if (res == SCSI_SCAN_LUN_PRESENT || res == SCSI_SCAN_TARGET_PRESENT) {
if (scsi_report_lun_scan(starget, bflags, rescan) != 0) //try a REPORT LUNS based scan first
/*
* The REPORT LUN did not scan the target,
* do a sequential scan.
*/
scsi_sequential_lun_scan(starget, bflags, //fallback: scan the LUNs sequentially
starget->scsi_level, rescan);
}
out_reap:
scsi_autopm_put_target(starget);
/*
* paired with scsi_alloc_target(): determine if the target has
* any children at all and if not, nuke it
*/
scsi_target_reap(starget);
put_device(&starget->dev);
}
掃描到target時分配并初始化scsi_target結構,scsi_probe_and_add_lun函數完成探測target中的lun,并將發現的lun添加到系統。
static int scsi_probe_and_add_lun(struct scsi_target *starget,
u64 lun, int *bflagsp,
struct scsi_device **sdevp, int rescan,
void *hostdata)
{
/*
 * Probe a single LUN of starget and add it when a usable device answers.
 *
 * @bflagsp, when non-NULL, receives the device flags.
 * @sdevp, when non-NULL, receives the scsi_device on success (a reference
 * is held for the caller).
 *
 * Returns
 *   SCSI_SCAN_LUN_PRESENT    - the LUN exists (already known, or added now)
 *   SCSI_SCAN_TARGET_PRESENT - the target answered but reports no device
 *                              at this LUN (PQ 3, or PDT 0x1f cases below)
 *   SCSI_SCAN_NO_RESPONSE    - allocation failure, failed INQUIRY, or the
 *                              device could not be added
 */
struct scsi_device *sdev;
unsigned char *result;
int bflags, res = SCSI_SCAN_NO_RESPONSE, result_len = 256;
struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);

/*
 * The rescan flag is used as an optimization, the first scan of a
 * host adapter calls into here with rescan == 0.
 */
sdev = scsi_device_lookup_by_target(starget, lun);	/* is this LUN already known? */
if (sdev) {
	if (rescan || !scsi_device_created(sdev)) {
		/* Known device: report it without probing again. */
		SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
			"scsi scan: device exists on %s\n",
			dev_name(&sdev->sdev_gendev)));
		if (sdevp)
			*sdevp = sdev;	/* lookup reference is handed to the caller */
		else
			scsi_device_put(sdev);

		if (bflagsp)
			*bflagsp = scsi_get_device_flags(sdev,
							 sdev->vendor,
							 sdev->model);
		return SCSI_SCAN_LUN_PRESENT;
	}
	scsi_device_put(sdev);
} else
	sdev = scsi_alloc_sdev(starget, lun, hostdata);	/* unknown LUN: allocate a scsi_device */
if (!sdev)
	goto out;

/* Buffer for the INQUIRY response. */
result = kmalloc(result_len, GFP_ATOMIC |
		 ((shost->unchecked_isa_dma) ? __GFP_DMA : 0));
if (!result)
	goto out_free_sdev;

/* Send INQUIRY to the device. */
if (scsi_probe_lun(sdev, result, result_len, &bflags))
	goto out_free_result;

if (bflagsp)
	*bflagsp = bflags;
/*
 * result contains valid SCSI INQUIRY data.
 */
if (((result[0] >> 5) == 3) && !(bflags & BLIST_ATTACH_PQ3)) {
	/*
	 * For a Peripheral qualifier 3 (011b), the SCSI
	 * spec says: The device server is not capable of
	 * supporting a physical device on this logical
	 * unit.
	 *
	 * For disks, this implies that there is no
	 * logical disk configured at sdev->lun, but there
	 * is a target id responding.
	 */
	/*
	 * Terminated explicitly so the statement does not depend on the
	 * macro expansion supplying its own ';'.
	 */
	SCSI_LOG_SCAN_BUS(2, sdev_printk(KERN_INFO, sdev, "scsi scan:"
			   " peripheral qualifier of 3, device not"
			   " added\n"));
	if (lun == 0) {
		SCSI_LOG_SCAN_BUS(1, {
			unsigned char vend[9];
			unsigned char mod[17];

			sdev_printk(KERN_INFO, sdev,
				"scsi scan: consider passing scsi_mod."
				"dev_flags=%s:%s:0x240 or 0x1000240\n",
				scsi_inq_str(vend, result, 8, 16),
				scsi_inq_str(mod, result, 16, 32));
		});
	}

	res = SCSI_SCAN_TARGET_PRESENT;
	goto out_free_result;
}

/*
 * Some targets may set slight variations of PQ and PDT to signal
 * that no LUN is present, so don't add sdev in these cases.
 * Two specific examples are:
 * 1) NetApp targets: return PQ=1, PDT=0x1f
 * 2) USB UFI: returns PDT=0x1f, with the PQ bits being "reserved"
 *    in the UFI 1.0 spec (we cannot rely on reserved bits).
 *
 * References:
 * 1) SCSI SPC-3, pp. 145-146
 * PQ=1: "A peripheral device having the specified peripheral
 * device type is not connected to this logical unit. However, the
 * device server is capable of supporting the specified peripheral
 * device type on this logical unit."
 * PDT=0x1f: "Unknown or no device type"
 * 2) USB UFI 1.0, p. 20
 * PDT=00h Direct-access device (floppy)
 * PDT=1Fh none (no FDD connected to the requested logical unit)
 */
if (((result[0] >> 5) == 1 || starget->pdt_1f_for_no_lun) &&
    (result[0] & 0x1f) == 0x1f &&
    !scsi_is_wlun(lun)) {
	SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
				"scsi scan: peripheral device type"
				" of 31, no device added\n"));
	res = SCSI_SCAN_TARGET_PRESENT;
	goto out_free_result;
}

/* Add the SCSI device to the subsystem. */
res = scsi_add_lun(sdev, result, &bflags, shost->async_scan);
if (res == SCSI_SCAN_LUN_PRESENT) {
	if (bflags & BLIST_KEY) {
		sdev->lockable = 0;
		scsi_unlock_floptical(sdev, result);
	}
}

out_free_result:
kfree(result);
out_free_sdev:
if (res == SCSI_SCAN_LUN_PRESENT) {
	if (sdevp) {
		/* Hand the device to the caller only if a reference can be taken. */
		if (scsi_device_get(sdev) == 0) {
			*sdevp = sdev;
		} else {
			__scsi_remove_device(sdev);
			res = SCSI_SCAN_NO_RESPONSE;
		}
	}
} else
	__scsi_remove_device(sdev);	/* nothing usable at this LUN: drop the sdev */
out:
return res;
}
scsi_probe_and_add_lun函數由名字可知,完成lun的probe和add兩個操作:先由scsi_probe_lun發送INQUIRY命令探測lun,再由scsi_add_lun將探測到的lun添加到系統:
static int scsi_probe_lun(struct scsi_device *sdev, unsigned char *inq_result,
int result_len, int *bflags)
{
/*
 * Probe sdev with INQUIRY, storing the response in inq_result (a buffer of
 * result_len bytes) and the device flags in *bflags.
 *
 * Up to three passes are made: a first pass with a conservative length, a
 * second pass with a larger length when the first pass indicates more
 * data is available, and a third pass back at the first length when the
 * second one fails.  Within each pass the command is attempted up to three
 * times.
 *
 * On success sdev->inquiry_len, sdev->scsi_level, the target's scsi_level
 * and sdev->lun_in_cdb are updated and 0 is returned; -EIO is returned
 * when the last attempt failed.
 */
unsigned char scsi_cmd[MAX_COMMAND_SIZE];
int first_inquiry_len, try_inquiry_len, next_inquiry_len;
int response_len = 0;
int pass, count, result;
struct scsi_sense_hdr sshdr;
*bflags = 0;
/* Perform up to 3 passes. The first pass uses a conservative
* transfer length of 36 unless sdev->inquiry_len specifies a
* different value. */
first_inquiry_len = sdev->inquiry_len ? sdev->inquiry_len : 36;
try_inquiry_len = first_inquiry_len;
pass = 1;
next_pass:
SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
"scsi scan: INQUIRY pass %d length %d\n",
pass, try_inquiry_len));
/* Each pass gets up to three chances to ignore Unit Attention */
for (count = 0; count < 3; ++count) {
int resid;
memset(scsi_cmd, 0, 6);
scsi_cmd[0] = INQUIRY; //the command opcode is INQUIRY
scsi_cmd[4] = (unsigned char) try_inquiry_len;
memset(inq_result, 0, try_inquiry_len);
//send the SCSI command, with up to 3 retries
result = scsi_execute_req(sdev, scsi_cmd, DMA_FROM_DEVICE,
inq_result, try_inquiry_len, &sshdr,
HZ / 2 + HZ * scsi_inq_timeout, 3,
&resid);
SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
"scsi scan: INQUIRY %s with code 0x%x\n",
result ? "failed" : "successful", result));
if (result) {
/*
* not-ready to ready transition [asc/ascq=0x28/0x0]
* or power-on, reset [asc/ascq=0x29/0x0], continue.
* INQUIRY should not yield UNIT_ATTENTION
* but many buggy devices do so anyway.
*/
if ((driver_byte(result) & DRIVER_SENSE) &&
scsi_sense_valid(&sshdr)) {
if ((sshdr.sense_key == UNIT_ATTENTION) &&
((sshdr.asc == 0x28) ||
(sshdr.asc == 0x29)) &&
(sshdr.ascq == 0))
continue;
}
} else {
/*
* if nothing was transferred, we try
* again. It's a workaround for some USB
* devices.
*/
if (resid == try_inquiry_len)
continue;
}
break;
}
if (result == 0) {
/* Clean up the strings at offsets 8 (8 bytes), 16 (16 bytes) and 32 (4 bytes). */
sanitize_inquiry_string(&inq_result[8], 8);
sanitize_inquiry_string(&inq_result[16], 16);
sanitize_inquiry_string(&inq_result[32], 4);
/* Byte 4 plus 5 gives the total response length. */
response_len = inq_result[4] + 5;
if (response_len > 255)
response_len = first_inquiry_len; /* sanity */
/*
* Get any flags for this device.
*
* XXX add a bflags to scsi_device, and replace the
* corresponding bit fields in scsi_device, so bflags
* need not be passed as an argument.
*/
*bflags = scsi_get_device_flags(sdev, &inq_result[8],
&inq_result[16]);
/* When the first pass succeeds we gain information about
* what larger transfer lengths might work. */
if (pass == 1) {
if (BLIST_INQUIRY_36 & *bflags)
next_inquiry_len = 36;
else if (BLIST_INQUIRY_58 & *bflags)
next_inquiry_len = 58;
else if (sdev->inquiry_len)
next_inquiry_len = sdev->inquiry_len;
else
next_inquiry_len = response_len;
/* If more data is available perform the second pass */
if (next_inquiry_len > try_inquiry_len) {
try_inquiry_len = next_inquiry_len;
pass = 2;
goto next_pass;
}
}
} else if (pass == 2) {
sdev_printk(KERN_INFO, sdev,
"scsi scan: %d byte inquiry failed. "
"Consider BLIST_INQUIRY_36 for this device\n",
try_inquiry_len);
/* If this pass failed, the third pass goes back and transfers
* the same amount as we successfully got in the first pass. */
try_inquiry_len = first_inquiry_len;
pass = 3;
goto next_pass;
}
/* If the last transfer attempt got an error, assume the
* peripheral doesn't exist or is dead. */
if (result)
return -EIO;
/* Don't report any more data than the device says is valid */
sdev->inquiry_len = min(try_inquiry_len, response_len);
/*
* XXX Abort if the response length is less than 36? If less than
* 32, the lookup of the device flags (above) could be invalid,
* and it would be possible to take an incorrect action - we do
* not want to hang because of a short INQUIRY. On the flip side,
* if the device is spun down or becoming ready (and so it gives a
* short INQUIRY), an abort here prevents any further use of the
* device, including spin up.
*
* On the whole, the best approach seems to be to assume the first
* 36 bytes are valid no matter what the device says. That's
* better than copying < 36 bytes to the inquiry-result buffer
* and displaying garbage for the Vendor, Product, or Revision
* strings.
*/
if (sdev->inquiry_len < 36) {
/* The message is printed only once per host (short_inquiry latches). */
if (!sdev->host->short_inquiry) {
shost_printk(KERN_INFO, sdev->host,
"scsi scan: INQUIRY result too short (%d),"
" using 36\n", sdev->inquiry_len);
sdev->host->short_inquiry = 1;
}
sdev->inquiry_len = 36;
}
/*
* Related to the above issue:
*
* XXX Devices (disk or all?) should be sent a TEST UNIT READY,
* and if not ready, sent a START_STOP to start (maybe spin up) and
* then send the INQUIRY again, since the INQUIRY can change after
* a device is initialized.
*
* Ideally, start a device if explicitly asked to do so. This
* assumes that a device is spun up on power on, spun down on
* request, and then spun up on request.
*/
/*
* The scanning code needs to know the scsi_level, even if no
* device is attached at LUN 0 (SCSI_SCAN_TARGET_PRESENT) so
* non-zero LUNs can be scanned.
*/
sdev->scsi_level = inq_result[2] & 0x07;
/* The stored level is one higher than the reported value in these cases. */
if (sdev->scsi_level >= 2 ||
(sdev->scsi_level == 1 && (inq_result[3] & 0x0f) == 1))
sdev->scsi_level++;
sdev->sdev_target->scsi_level = sdev->scsi_level;
/*
* If SCSI-2 or lower, and if the transport requires it,
* store the LUN value in CDB[1].
*/
sdev->lun_in_cdb = 0;
if (sdev->scsi_level <= SCSI_2 &&
sdev->scsi_level != SCSI_UNKNOWN &&
!sdev->host->no_scsi2_lun_in_cdb)
sdev->lun_in_cdb = 1;
return 0;
}
/*
 * scsi_add_lun - configure a probed scsi_device and register it
 * @sdev:       device that answered the INQUIRY
 * @inq_result: INQUIRY response data
 * @bflags:     device flags; may be modified here (BLIST_NOREPORTLUN)
 * @async:      when non-zero the sysfs registration step is skipped here
 *
 * Saves a copy of the INQUIRY data, derives the device attributes from it
 * and from the flags, moves the device to SDEV_RUNNING (or SDEV_BLOCK),
 * runs the transport and host-template configuration hooks and, unless
 * @async is set, adds the device to sysfs.
 *
 * Returns SCSI_SCAN_LUN_PRESENT on success and SCSI_SCAN_NO_RESPONSE when
 * the copy cannot be allocated, the state change fails, slave_configure
 * fails or the sysfs registration fails.
 */
static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
int *bflags, int async)
{
int ret;
/*
* XXX do not save the inquiry, since it can change underneath us,
* save just vendor/model/rev.
*
* Rather than save it and have an ioctl that retrieves the saved
* value, have an ioctl that executes the same INQUIRY code used
* in scsi_probe_lun, let user level programs doing INQUIRY
* scanning run at their own risk, or supply a user level program
* that can correctly scan.
*/
/*
* Copy at least 36 bytes of INQUIRY data, so that we don't
* dereference unallocated memory when accessing the Vendor,
* Product, and Revision strings. Badly behaved devices may set
* the INQUIRY Additional Length byte to a small value, indicating
* these strings are invalid, but often they contain plausible data
* nonetheless. It doesn't matter if the device sent < 36 bytes
* total, since scsi_probe_lun() initializes inq_result with 0s.
*/
sdev->inquiry = kmemdup(inq_result,
max_t(size_t, sdev->inquiry_len, 36),
GFP_ATOMIC);
if (sdev->inquiry == NULL)
return SCSI_SCAN_NO_RESPONSE;
sdev->vendor = (char *) (sdev->inquiry + 8); //bytes 8-15 are the vendor identification
sdev->model = (char *) (sdev->inquiry + 16); //bytes 16-31 are the product identification
sdev->rev = (char *) (sdev->inquiry + 32); //bytes 32-35 are the product revision level
if (strncmp(sdev->vendor, "ATA ", 8) == 0) {
/*
* sata emulation layer device. This is a hack to work around
* the SATL power management specifications which state that
* when the SATL detects the device has gone into standby
* mode, it shall respond with NOT READY.
*/
sdev->allow_restart = 1;
}
if (*bflags & BLIST_ISROM) {
sdev->type = TYPE_ROM;
sdev->removable = 1;
} else {
/* Type is the low 5 bits of byte 0; removable is bit 7 of byte 1. */
sdev->type = (inq_result[0] & 0x1f);
sdev->removable = (inq_result[1] & 0x80) >> 7;
/*
* some devices may respond with wrong type for
* well-known logical units. Force well-known type
* to enumerate them correctly.
*/
if (scsi_is_wlun(sdev->lun) && sdev->type != TYPE_WLUN) {
/*
 * NOTE(review): "%16xhN" prints a 16-wide hex field followed by the
 * literal text "hN", and the LUN is cast to unsigned int first --
 * confirm this is the intended output format.
 */
sdev_printk(KERN_WARNING, sdev,
"%s: correcting incorrect peripheral device type 0x%x for W-LUN 0x%16xhN\n",
__func__, sdev->type, (unsigned int)sdev->lun);
sdev->type = TYPE_WLUN;
}
}
if (sdev->type == TYPE_RBC || sdev->type == TYPE_ROM) {
/* RBC and MMC devices can return SCSI-3 compliance and yet
* still not support REPORT LUNS, so make them act as
* BLIST_NOREPORTLUN unless BLIST_REPORTLUN2 is
* specifically set */
if ((*bflags & BLIST_REPORTLUN2) == 0)
*bflags |= BLIST_NOREPORTLUN;
}
/*
* For a peripheral qualifier (PQ) value of 1 (001b), the SCSI
* spec says: The device server is capable of supporting the
* specified peripheral device type on this logical unit. However,
* the physical device is not currently connected to this logical
* unit.
*
* The above is vague, as it implies that we could treat 001 and
* 011 the same. Stay compatible with previous code, and create a
* scsi_device for a PQ of 1
*
* Don't set the device offline here; rather let the upper
* level drivers eval the PQ to decide whether they should
* attach. So remove ((inq_result[0] >> 5) & 7) == 1 check.
*/
sdev->inq_periph_qual = (inq_result[0] >> 5) & 7;
sdev->lockable = sdev->removable;
sdev->soft_reset = (inq_result[7] & 1) && ((inq_result[3] & 7) == 2);
if (sdev->scsi_level >= SCSI_3 ||
(sdev->inquiry_len > 56 && inq_result[56] & 0x04))
sdev->ppr = 1;
if (inq_result[7] & 0x60)
sdev->wdtr = 1;
if (inq_result[7] & 0x10)
sdev->sdtr = 1;
sdev_printk(KERN_NOTICE, sdev, "%s %.8s %.16s %.4s PQ: %d "
"ANSI: %d%s\n", scsi_device_type(sdev->type),
sdev->vendor, sdev->model, sdev->rev,
sdev->inq_periph_qual, inq_result[2] & 0x07,
(inq_result[3] & 0x0f) == 1 ? " CCS" : "");
if ((sdev->scsi_level >= SCSI_2) && (inq_result[7] & 2) &&
!(*bflags & BLIST_NOTQ)) {
sdev->tagged_supported = 1;
sdev->simple_tags = 1;
}
/*
* Some devices (Texel CD ROM drives) have handshaking problems
* when used with the Seagate controllers. borken is initialized
* to 1, and then set it to 0 here.
*/
if ((*bflags & BLIST_BORKEN) == 0)
sdev->borken = 0;
if (*bflags & BLIST_NO_ULD_ATTACH)
sdev->no_uld_attach = 1;
/*
* Apparently some really broken devices (contrary to the SCSI
* standards) need to be selected without asserting ATN
*/
if (*bflags & BLIST_SELECT_NO_ATN)
sdev->select_no_atn = 1;
/*
* Maximum 512 sector transfer length
* broken RA4x00 Compaq Disk Array
*/
if (*bflags & BLIST_MAX_512)
blk_queue_max_hw_sectors(sdev->request_queue, 512);
/*
* Max 1024 sector transfer length for targets that report incorrect
* max/optimal lengths and relied on the old block layer safe default
*/
else if (*bflags & BLIST_MAX_1024)
blk_queue_max_hw_sectors(sdev->request_queue, 1024);
/*
* Some devices may not want to have a start command automatically
* issued when a device is added.
*/
if (*bflags & BLIST_NOSTARTONADD)
sdev->no_start_on_add = 1;
if (*bflags & BLIST_SINGLELUN)
scsi_target(sdev)->single_lun = 1;
sdev->use_10_for_rw = 1;
if (*bflags & BLIST_MS_SKIP_PAGE_08)
sdev->skip_ms_page_8 = 1;
if (*bflags & BLIST_MS_SKIP_PAGE_3F)
sdev->skip_ms_page_3f = 1;
if (*bflags & BLIST_USE_10_BYTE_MS)
sdev->use_10_for_ms = 1;
/* some devices don't like REPORT SUPPORTED OPERATION CODES
* and will simply timeout causing sd_mod init to take a very
* very long time */
if (*bflags & BLIST_NO_RSOC)
sdev->no_report_opcodes = 1;
/* set the device running here so that slave configure
* may do I/O */
ret = scsi_device_set_state(sdev, SDEV_RUNNING); //update the device state
if (ret) {
/* SDEV_RUNNING was refused: try SDEV_BLOCK before giving up. */
ret = scsi_device_set_state(sdev, SDEV_BLOCK);
if (ret) {
sdev_printk(KERN_ERR, sdev,
"in wrong state %s to complete scan\n",
scsi_device_state_name(sdev->sdev_state));
return SCSI_SCAN_NO_RESPONSE;
}
}
if (*bflags & BLIST_MS_192_BYTES_FOR_3F)
sdev->use_192_bytes_for_3f = 1;
if (*bflags & BLIST_NOT_LOCKABLE)
sdev->lockable = 0;
if (*bflags & BLIST_RETRY_HWERROR)
sdev->retry_hwerror = 1;
if (*bflags & BLIST_NO_DIF)
sdev->no_dif = 1;
sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT;
if (*bflags & BLIST_TRY_VPD_PAGES)
sdev->try_vpd_pages = 1;
else if (*bflags & BLIST_SKIP_VPD_PAGES)
sdev->skip_vpd_pages = 1;
transport_configure_device(&sdev->sdev_gendev); //configure the LUN in the SCSI transport layer
if (sdev->host->hostt->slave_configure) {
ret = sdev->host->hostt->slave_configure(sdev); //host-template callback performing adapter-specific initialisation of the scsi_device (LUN)
if (ret) {
/*
* if LLDD reports slave not present, don't clutter
* console with alloc failure messages
*/
if (ret != -ENXIO) {
sdev_printk(KERN_ERR, sdev,
"failed to configure device\n");
}
return SCSI_SCAN_NO_RESPONSE;
}
}
if (sdev->scsi_level >= SCSI_3)
scsi_attach_vpd(sdev);
sdev->max_queue_depth = sdev->queue_depth; //record the maximum queue depth
/*
* Ok, the device is now all set up, we can
* register it and tell the rest of the kernel
* about it.
*/ //add the scsi_device (LUN) to sysfs
if (!async && scsi_sysfs_add_sdev(sdev) != 0)
return SCSI_SCAN_NO_RESPONSE;
return SCSI_SCAN_LUN_PRESENT;
}
|
|
|