insmod 入口函数
本文用到的 busybox 版本为 1.34.1,Linux 内核版本为 4.14.294
insmod_main()
函数是 insmod 命令的入口函数,该函数首先通过函数参数获取被加载模块的名字并存入局部指针变量 filename,然后调用bb_init_module()
函数进行后续操作。
/*
 * Entry point of the insmod applet.
 *
 * Takes the module file name from the command line, builds the module
 * option string from the remaining arguments and hands both to
 * bb_init_module().  Returns the value reported by bb_init_module();
 * a non-zero value is also reported through bb_error_msg().
 */
int insmod_main(int argc UNUSED_PARAM, char **argv)
{
	char *module_path;
	char *module_opts;
	int status;

	/*
	 * Compatibility:
	 * - 2.6 style insmod accepts no options and needs a file name
	 *   (a real file name, not a module name: ".ko" is mandatory).
	 * - 2.4 style insmod accepts a module name without ".o" and looks
	 *   the module up in the default directories or in $MODPATH.
	 */
	IF_FEATURE_2_4_MODULES(
		getopt32(argv, INSMOD_OPTS INSMOD_ARGS);
		argv += optind - 1;
	);

	/* Step over the current element; the next one is the module file. */
	argv++;
	module_path = argv[0];
	if (module_path == NULL)
		bb_show_usage();

	/* Everything after the file name is passed to the module as options. */
	module_opts = parse_cmdline_module_options(argv, /*quote_spaces:*/ 0);
	status = bb_init_module(module_path, module_opts);
	if (status != 0)
		bb_error_msg("can't insert '%s': %s", module_path, moderror(status));

	return status;
}
模块参数解析函数
parse_cmdline_module_options()
函数会解析模块加载时传给模块的参数,通过while
循环挨个解析模块后面传给模块的参数,并将解析出来的参数值val
存入指针变量options
指向的内存空间,最后返回该内存空间的首地址。
/*
 * Join the module parameters found in argv into one string.
 *
 * The first element of argv is skipped; every following element is
 * appended to the result followed by a single space.  When quote_spaces
 * is non-zero and an element has the form var=val where val contains a
 * space, the value part is wrapped in double quotes.
 *
 * Returns a heap buffer obtained from xzalloc()/xrealloc(); it is an empty
 * string when there are no parameters.  The trailing space is left in place.
 */
char* FAST_FUNC parse_cmdline_module_options(char **argv, int quote_spaces)
{
	char *buf = xzalloc(1);
	int used = 0;

	for (argv++; *argv != NULL; argv++) {
		const char *param = *argv;
		const char *fmt = "%.*s%s ";
		/* Points at '=' if present, otherwise at the terminating NUL. */
		const char *value = strchrnul(param, '=');

		/* Room for the parameter, two quotes, a space and the NUL. */
		buf = xrealloc(buf, used + 2 + strlen(param) + 2);

		/*
		 * modprobe (module-init-tools 3.11.1) compatibility:
		 * only the value is quoted, i.e. var="val with spaces"
		 * and not "var=val with spaces".  The variable *name*
		 * is never checked for spaces.
		 */
		if (quote_spaces && *value != '\0') {
			/* var=val form: move past '=' so it stays with the name. */
			value++;
			if (strchr(value, ' ') != NULL)
				fmt = "%.*s\"%s\" ";
		}

		used += sprintf(buf + used, fmt, (int)(value - param), param, value);
	}

	/* Stripping the trailing space is intentionally disabled. */
	return buf;
}
映射模块文件
bb_init_module()
函数首先判断模块有没有参数传入,调用try_to_mmap_module()
函数完成后续映射工作,该函数接收两个参数:被加载模块的名字(filename),以及作为输入输出参数传入的模块大小(image_size,传入允许的最大值,返回实际映射的大小)。最后调用init_module()
函数,init_module()
函数是系统调用函数,对应的内核函数是sys_init_module()
函数,进入到内核空间。传入的参数分别是:模块内存空间首地址(image),模块大小(image_size),模块参数内存空间首地址(options)。
/*
 * Load the module stored in `filename` into the kernel, passing `options`
 * as the module parameter string (NULL is treated as "").
 *
 * Returns 0 when finit_module() succeeds, -errno when the module file
 * cannot be read into memory, and otherwise the errno value left behind
 * by init_module() (0 on success).
 */
int FAST_FUNC bb_init_module(const char *filename, const char *options)
{
	size_t image_size;
	char *image;
	int rc;
	bool mmaped;

	options = options ? options : "";

/* TODO: audit bb_init_module_24 to match error code convention */
#if ENABLE_FEATURE_2_4_MODULES
	if (get_linux_version_code() < KERNEL_VERSION(2,6,0))
		return bb_init_module_24(filename, options);
#endif

	/*
	 * Prefer finit_module when it exists.  Some kernels only permit
	 * loading modules from secure storage (e.g. a read-only rootfs),
	 * which requires finit_module.  On failure we fall through to the
	 * classic path below so that compressed modules keep working.
	 */
#ifdef __NR_finit_module
	{
		int fd = open(filename, O_RDONLY | O_CLOEXEC);
		if (fd >= 0) {
			int failed = (finit_module(fd, options, 0) != 0);
			close(fd);
			if (!failed)
				return 0;
		}
	}
#endif

	/* Upper bound on the size we are willing to mmap. */
	image_size = INT_MAX - 4095;
	image = try_to_mmap_module(filename, &image_size);
	mmaped = (image != NULL);
	if (!mmaped) {
		/* Default error; open errors in the call below may replace it. */
		errno = ENOMEM;
		image = xmalloc_open_zipped_read_close(filename, &image_size);
		if (image == NULL)
			return -errno;
	}

	/* Capture errno right after the syscall, before cleanup can touch it. */
	errno = 0;
	init_module(image, image_size, options);
	rc = errno;

	if (!mmaped)
		free(image);
	else
		munmap(image, image_size);

	return rc;
}
try_to_mmap_module()
函数首先打开模块文件获取模块文件描述符fd
,然后通过fstat()
函数获取模块文件的详细信息,判断模块文件的大小st_size
是否超过了设定的文件最大值,调用mmap_read()
函数以只读的方式将模块文件的内容映射进内存空间,并返回该内存空间的首地址,通过*(uint32_t*)image != SWAP_BE32(0x7f454C46)
检查模块文件是否符合 ELF 标准格式,最后将内存空间的首地址image
返回。通过try_to_mmap_module()
函数我们就获取了模块文件内容在内存空间的地址。
/*
 * Try to map a module file into memory instead of reading it.
 *
 * filename:     path of the module file.
 * image_size_p: on entry, the largest file size the caller accepts;
 *               on success, overwritten with the size of the mapping.
 *
 * Returns the address of the mapping, or NULL when the file cannot be
 * stat'ed, is larger than *image_size_p, cannot be mapped, or does not
 * start with the ELF magic (e.g. a compressed module).  On NULL,
 * *image_size_p is left untouched.
 */
void* FAST_FUNC try_to_mmap_module(const char *filename, size_t *image_size_p)
{
	/* We have user reports of failure to load 3MB module
	 * on a 16MB RAM machine. Apparently even a transient
	 * memory spike to 6MB during module load
	 * is too big for that system. */
	void *image = NULL;
	struct stat st;
	int fd;

	fd = xopen(filename, O_RDONLY);

	/*
	 * If fstat() fails the contents of st are indeterminate, so do not
	 * look at st.st_size; report "cannot mmap" and let the caller use
	 * its fallback path.
	 * st.st_size is off_t, we can't just pass it to mmap.
	 */
	if (fstat(fd, &st) == 0 && st.st_size <= *image_size_p) {
		size_t image_size = st.st_size;

		image = mmap_read(fd, image_size);
		if (image == MAP_FAILED) {
			image = NULL;
		} else if (*(uint32_t*)image != SWAP_BE32(0x7f454C46)) {
			/* No ELF signature. Compressed module? */
			munmap(image, image_size);
			image = NULL;
		} else {
			/* Success. Report the size */
			*image_size_p = image_size;
		}
	}

	close(fd);
	return image;
}
从init_module()
开始调用关系会进入到 Linux 内核源码。
init_module()
其实是一个宏定义,最终会通过系统调用号__NR_init_module
进入内核,该系统调用号对应的系统调用函数是sys_init_module()
,该对应关系位于 Linux 内核源码include/uapi/asm-generic/unistd.h
文件中。关于 Linux 系统调用的知识,后面会专门写个文章分析 Linux 系统调用的实现机制,并手写一个内核没有的系统调用。
/*
 * NOTE(review): these three lines are excerpts quoted side by side; they
 * presumably do not live in the same file (the first looks like a userspace
 * definition, the other two are attributed by the surrounding text to the
 * kernel's include/uapi/asm-generic/unistd.h) - confirm.
 */
/* Wrapper macro: expands to a syscall() invocation with number __NR_init_module. */
#define init_module(mod, len, opts) syscall(__NR_init_module, mod, len, opts)
/* System call number used for init_module. */
#define __NR_init_module 105
/* Table entry pairing __NR_init_module with sys_init_module (presumably the kernel-side handler). */
__SYSCALL(__NR_init_module, sys_init_module)
而sys_init_module()
函数是由宏定义SYSCALL_DEFINE3
展开形成的,该定义位于文件include/linux/syscalls.h
中
/*
 * SYSCALL_DEFINE3(name, ...) - define a system call taking three arguments.
 * Forwards to SYSCALL_DEFINEx with the argument count 3 and the name
 * prefixed by an underscore.
 */
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)

/*
 * SYSCALL_DEFINEx(x, sname, ...) - emit the metadata for the system call
 * via SYSCALL_METADATA, then the function scaffolding via __SYSCALL_DEFINEx.
 */
#define SYSCALL_DEFINEx(x, sname, ...)					\
	SYSCALL_METADATA(sname, x, __VA_ARGS__)				\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

/*
 * __SYSCALL_DEFINEx(x, name, ...) - generate three functions:
 *   sys<name>  - declared as an alias of SyS<name>;
 *   SyS<name>  - takes the arguments mapped through __SC_LONG, passes them
 *                mapped through __SC_CAST to SYSC<name>, and returns its
 *                result;
 *   SYSC<name> - static inline; the macro ends with this function's header,
 *                so the brace-enclosed body written after the macro
 *                invocation becomes the body of SYSC<name>.
 */
#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
		__attribute__((alias(__stringify(SyS##name))));		\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
	{								\
		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
		__MAP(x,__SC_TEST,__VA_ARGS__);				\
		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
		return ret;						\
	}								\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
通过SYSCALL_DEFINE3
定义的sys_init_module()函数位于kernel/module.c
文件中,该函数首先调用may_init_module()
函数判断用户是否有加载模块的权限,调用copy_module_from_user()
函数将模块文件的内容从用户空间内存地址拷贝到内核空间内存地址,具体实现后面会分析,最后调用load_module()
函数,细节详见下面分析。
/*
 * init_module system call.
 *
 * umod:  user-space address of the module image.
 * len:   size of the module image in bytes.
 * uargs: user-space address of the module parameter string.
 *
 * Checks permission with may_init_module(), copies the image into kernel
 * memory with copy_module_from_user(), then hands over to load_module().
 * Returns the error from the first step that fails, otherwise the result
 * of load_module().
 */
SYSCALL_DEFINE3(init_module, void __user *, umod,
		unsigned long, len, const char __user *, uargs)
{
	int err;
	struct load_info info = { };

	err = may_init_module();
	if (err)
		return err;

	pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
	       umod, len, uargs);

	/* Fills in info.hdr / info.len with a kernel copy of the image. */
	err = copy_module_from_user(umod, len, &info);
	if (err)
		return err;

	return load_module(&info, uargs, 0);
}
copy_module_from_user()
函数首先给load_info
结构体成员info->len
赋值为模块大小len
,调用__vmalloc()
函数在内核空间为模块分配info->len
大小的内存空间,并返回内核内存空间的起始地址info->hdr
,最后调用copy_chunked_from_user()
函数(其内部其实就是分块调用copy_from_user()
函数)将用户空间内存中的模块文件内容拷贝到info->hdr
所指向的内核空间内存地址
/*
 * Copy a module image from user space into a freshly allocated kernel buffer.
 *
 * umod: user-space address of the module image.
 * len:  size of the image in bytes.
 * info: receives the length (info->len) and the kernel copy (info->hdr).
 *
 * Returns 0 on success, -ENOEXEC if the image is smaller than the header
 * type info->hdr points to, the error from security_kernel_read_file() if
 * that check fails, -ENOMEM if the buffer cannot be allocated, or -EFAULT
 * if the copy from user space fails (the buffer is freed in that case).
 */
static int copy_module_from_user(const void __user *umod, unsigned long len,
				  struct load_info *info)
{
	int err;

	info->len = len;
	if (info->len < sizeof(*(info->hdr)))
		return -ENOEXEC;

	err = security_kernel_read_file(NULL, READING_MODULE);
	if (err)
		return err;

	/* Suck in entire file: we'll want most of it. */
	info->hdr = __vmalloc(info->len,
			GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL);
	if (!info->hdr)
		return -ENOMEM;

	if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
		vfree(info->hdr);
		return -EFAULT;
	}

	return 0;
}
至此,模块文件已经从用户空间拷贝到内核空间。
模块加载
鉴于模块加载函数load_module()
比较复杂,限于篇幅,具体的加载过程会在《Linux内核模块加载深度剖析(中篇)》一文中分析。
static int load_module(struct load_info *info, const char __user *uargs,
int flags)
{
struct module *mod;
long err;
char *after_dashes;
err = module_sig_check(info, flags);
if (err)
goto free_copy;
err = elf_header_check(info);
if (err)
goto free_copy;
/* Figure out module layout, and allocate all the memory. */
mod = layout_and_allocate(info, flags);
if (IS_ERR(mod)) {
err = PTR_ERR(mod);
goto free_copy;
}
audit_log_kern_module(mod- >name);
/* Reserve our place in the list. */
err = add_unformed_module(mod);
if (err)
goto free_module;
#ifdef CONFIG_MODULE_SIG
mod- >sig_ok = info- >sig_ok;
if (!mod- >sig_ok) {
pr_notice_once("%s: module verification failed: signature "
"and/or required key missing - tainting "
"kernel\\n", mod- >name);
add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
}
#endif
/* To avoid stressing percpu allocator, do this once we're unique. */
err = percpu_modalloc(mod, info);
if (err)
goto unlink_mod;
/* Now module is in final location, initialize linked lists, etc. */
err = module_unload_init(mod);
if (err)
goto unlink_mod;
init_param_lock(mod);
/* Now we've got everything in the final locations, we can
* find optional sections. */
err = find_module_sections(mod, info);
if (err)
goto free_unload;
err = check_module_license_and_versions(mod);
if (err)
goto free_unload;
/* Set up MODINFO_ATTR fields */
setup_modinfo(mod, info);
/* Fix up syms, so that st_value is a pointer to location. */
err = simplify_symbols(mod, info);
if (err < 0)
goto free_modinfo;
err = apply_relocations(mod, info);
if (err < 0)
goto free_modinfo;
err = post_relocation(mod, info);
if (err < 0)
goto free_modinfo;
flush_module_icache(mod);
/* Now copy in args */
mod- >args = strndup_user(uargs, ~0UL > > 1);
if (IS_ERR(mod- >args)) {
err = PTR_ERR(mod- >args);
goto free_arch_cleanup;
}
dynamic_debug_setup(mod, info- >debug, info- >num_debug);
/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
ftrace_module_init(mod);
/* Finally it's fully formed, ready to start executing. */
err = complete_formation(mod, info);
if (err)
goto ddebug_cleanup;
err = prepare_coming_module(mod);
if (err)
goto bug_cleanup;
/* Module is ready to execute: parsing args may do that. */
after_dashes = parse_args(mod- >name, mod- >args, mod- >kp, mod- >num_kp,
-32768, 32767, mod,
unknown_module_param_cb);
if (IS_ERR(after_dashes)) {
err = PTR_ERR(after_dashes);
goto coming_cleanup;
} else if (after_dashes) {
pr_warn("%s: parameters '%s' after `--' ignored\\n",
mod- >name, after_dashes);
}
/* Link in to sysfs. */
err = mod_sysfs_setup(mod, info, mod- >kp, mod- >num_kp);
if (err < 0)
goto coming_cleanup;
if (is_livepatch_module(mod)) {
err = copy_module_elf(mod, info);
if (err < 0)
goto sysfs_cleanup;
}
/* Get rid of temporary copy. */
free_copy(info);
/* Done! */
trace_module_load(mod);
return do_init_module(mod);
sysfs_cleanup:
mod_sysfs_teardown(mod);
coming_cleanup:
mod- >state = MODULE_STATE_GOING;
destroy_params(mod- >kp, mod- >num_kp);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
klp_module_going(mod);
bug_cleanup:
mod- >state = MODULE_STATE_GOING;
/* module_bug_cleanup needs module_mutex protection */
mutex_lock(&module_mutex);
module_bug_cleanup(mod);
mutex_unlock(&module_mutex);
/* we can't deallocate the module until we clear memory protection */
module_disable_ro(mod);
module_disable_nx(mod);
ddebug_cleanup:
dynamic_debug_remove(mod, info- >debug);
synchronize_sched();
kfree(mod- >args);
free_arch_cleanup:
module_arch_cleanup(mod);
free_modinfo:
free_modinfo(mod);
free_unload:
module_unload_free(mod);
unlink_mod:
mutex_lock(&module_mutex);
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod- >list);
mod_tree_remove(mod);
wake_up_all(&module_wq);
/* Wait for RCU-sched synchronizing before releasing mod- >list. */
synchronize_sched();
mutex_unlock(&module_mutex);
free_module:
/*
* Ftrace needs to clean up what it initialized.
* This does nothing if ftrace_module_init() wasn't called,
* but it must be called outside of module_mutex.
*/
ftrace_release_mod(mod);
/* Free lock-classes; relies on the preceding sync_rcu() */
lockdep_free_key_range(