fork:通过fork创建新进程
vfork:主要用于马上执行exec的情况,因为马上就exec装入新的程序,所以可以去掉fork中分配新的地址空间等操作,进而加速进程创建
clone:功能更强,参数更多;主要用于创建线程/父子进程资源共享等,可以通过设置相应的参数实现fork、vfork的功能
I.系统调用
i.系统调用
- arch/x86/kernel/syscall_table_32.S
- 1 ENTRY(sys_call_table)
- 4 .long ptregs_fork /* 3 */
- 122 .long ptregs_clone /* 120 */
- 192 .long ptregs_vfork /* 190 */
-
- arch/x86/kernel/entry_32.S
- 709 /*
- 710 * System calls that need a pt_regs pointer.
- 711 */
- 712 #define PTREGSCALL(name) \
- 713 ALIGN; \
- 714 ptregs_##name: \
- 715 leal 4(%esp),%eax; \
- 716 jmp sys_##name;
- 717
- 718 PTREGSCALL(iopl)
- 719 PTREGSCALL(fork)
- 720 PTREGSCALL(clone)
- 721 PTREGSCALL(vfork)
-
- arch/x86/kernel/process.c
- 217 int sys_fork(struct pt_regs *regs)
- 218 {
- 219 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
- 220 }
- 221
- 222 /*
- 223 * This is trivial, and on the face of it looks like it
- 224 * could equally well be done in user mode.
- 225 *
- 226 * Not so, for quite unobvious reasons - register pressure.
- 227 * In user mode vfork() cannot have a stack frame, and if
- 228 * done by calling the "clone()" system call directly, you
- 229 * do not have enough call-clobbered registers to hold all
- 230 * the information you need.
- 231 */
- 232 int sys_vfork(struct pt_regs *regs)
- 233 {
- 234 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
- 235 NULL, NULL);
- 236 }
-
- arch/x86/kernel/process_32.c
- 432 int sys_clone(struct pt_regs *regs)
- 433 {
- 434 unsigned long clone_flags;
- 435 unsigned long newsp;
- 436 int __user *parent_tidptr, *child_tidptr;
- 437
- 438 clone_flags = regs->bx;
- 439 newsp = regs->cx;
- 440 parent_tidptr = (int __user *)regs->dx;
- 441 child_tidptr = (int __user *)regs->di;
- 442 if (!newsp)
- 443 newsp = regs->sp;
- 444 return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
- 445 }
可以看出fork,vfork,clone都是通过do_fork实现,只是传入参数不同而已
ii.do_fork
- kernel/fork.c
- 1362 /*
- 1363 * Ok, this is the main fork-routine.
- 1364 *
- 1365 * It copies the process, and if successful kick-starts
- 1366 * it and waits for it to finish using the VM if required.
- 1367 */
- 1368 long do_fork(unsigned long clone_flags,
- 1369 unsigned long stack_start,
- 1370 struct pt_regs *regs,
- 1371 unsigned long stack_size,
- 1372 int __user *parent_tidptr,
- 1373 int __user *child_tidptr)
- 1374 {
- 1375 struct task_struct *p;
- 1376 int trace = 0;
- 1377 long nr;
- 1378
- 1379 /*
- 1380 * Do some preliminary argument and permissions checking before we
- 1381 * actually start allocating stuff
- 1382 */
- 1383 if (clone_flags & CLONE_NEWUSER) {
- 1384 if (clone_flags & CLONE_THREAD)
- 1385 return -EINVAL;
- 1386 /* hopefully this check will go away when userns support is
- 1387 * complete
- 1388 */
- 1389 if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
- 1390 !capable(CAP_SETGID))
- 1391 return -EPERM;
- 1392 }
- 1393
- 1394 /*
- 1395 * We hope to recycle these flags after 2.6.26
- 1396 */
- 1397 if (unlikely(clone_flags & CLONE_STOPPED)) {
- 1398 static int __read_mostly count = 100;
- 1399
- 1400 if (count > 0 && printk_ratelimit()) {
- 1401 char comm[TASK_COMM_LEN];
- 1402
- 1403 count--;
- 1404 printk(KERN_INFO "fork(): process `%s' used deprecated "
- 1405 "clone flags 0x%lx\n",
- 1406 get_task_comm(comm, current),
- 1407 clone_flags & CLONE_STOPPED);
- 1408 }
- 1409 }
- 1410
- 1411 /*
- 1412 * When called from kernel_thread, don't do user tracing stuff.
- 1413 */
- 1414 if (likely(user_mode(regs)))
- 1415 trace = tracehook_prepare_clone(clone_flags);
- 1416
- 1417 p = copy_process(clone_flags, stack_start, regs, stack_size,
- 1418 child_tidptr, NULL, trace);
- 1419 /*
- 1420 * Do this prior waking up the new thread - the thread pointer
- 1421 * might get invalid after that point, if the thread exits quickly.
- 1422 */
- 1423 if (!IS_ERR(p)) {
- 1424 struct completion vfork;
- 1425
- 1426 trace_sched_process_fork(current, p);
- 1427
- 1428 nr = task_pid_vnr(p);
- 1429
- 1430 if (clone_flags & CLONE_PARENT_SETTID)
- 1431 put_user(nr, parent_tidptr);
- 1432
- 1433 if (clone_flags & CLONE_VFORK) {
- 1434 p->vfork_done = &vfork;
- 1435 init_completion(&vfork);
- 1436 }
- 1437
- 1438 audit_finish_fork(p);
- 1439 tracehook_report_clone(regs, clone_flags, nr, p);
- 1440
- 1441 /*
- 1442 * We set PF_STARTING at creation in case tracing wants to
- 1443 * use this to distinguish a fully live task from one that
- 1444 * hasn't gotten to tracehook_report_clone() yet. Now we
- 1445 * clear it and set the child going.
- 1446 */
- 1447 p->flags &= ~PF_STARTING;
- 1449 if (unlikely(clone_flags & CLONE_STOPPED)) {
- 1450 /*
- 1451 * We'll start up with an immediate SIGSTOP.
- 1452 */
- 1453 sigaddset(&p->pending.signal, SIGSTOP);
- 1454 set_tsk_thread_flag(p, TIF_SIGPENDING);
- 1455 __set_task_state(p, TASK_STOPPED);
- 1456 } else {
- 1457 wake_up_new_task(p, clone_flags);
- 1458 }
- 1459
- 1460 tracehook_report_clone_complete(trace, regs,
- 1461 clone_flags, nr, p);
- 1462
- 1463 if (clone_flags & CLONE_VFORK) {
- 1464 freezer_do_not_count();
- 1465 wait_for_completion(&vfork);
- 1466 freezer_count();
- 1467 tracehook_report_vfork_done(p, nr);
- 1468 }
- 1469 } else {
- 1470 nr = PTR_ERR(p);
- 1471 }
- 1472 return nr;
- 1473 }
1.输入参数及权限检查
2.复制进程
3.取子进程id
4.如果设置CLONE_PARENT_SETTID,则将子进程id放入parent_tidptr中
5.如果是vfork系统调用/带CLONE_VFORK标识的clone,初始化completion;它用于让父进程阻塞在vfork/clone中,待子进程执行exec/exit后父进程再继续执行
6.唤醒子进程,在copy_thread中设置子进程的运行环境;子进程从ret_from_fork执行,内核堆栈与父进程刚进入fork/vfork/clone的内核堆栈相同(返回值ax设置成0,sp设置成新的用户堆栈),直接退出系统调用即可。
7.如果是vfork系统调用/带CLONE_VFORK标识的clone,等待子进程执行exec/exit
II.复制进程copy_process
- kernel/fork.c
- 973 /*
- 974 * This creates a new process as a copy of the old one,
- 975 * but does not actually start it yet.
- 976 *
- 977 * It copies the registers, and all the appropriate
- 978 * parts of the process environment (as per the clone
- 979 * flags). The actual kick-off is left to the caller.
- 980 */
- 981 static struct task_struct *copy_process(unsigned long clone_flags,
- 982 unsigned long stack_start,
- 983 struct pt_regs *regs,
- 984 unsigned long stack_size,
- 985 int __user *child_tidptr,
- 986 struct pid *pid,
- 987 int trace)
- 988 {
- 989 int retval;
- 990 struct task_struct *p;
- 991 int cgroup_callbacks_done = 0;
- 992
- 993 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
- 994 return ERR_PTR(-EINVAL);
- 995
- 996 /*
- 997 * Thread groups must share signals as well, and detached threads
- 998 * can only be started up within the thread group.
- 999 */
- 1000 if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
- 1001 return ERR_PTR(-EINVAL);
- 1002
- 1003 /*
- 1004 * Shared signal handlers imply shared VM. By way of the above,
- 1005 * thread groups also imply shared VM. Blocking this case allows
- 1006 * for various simplifications in other code.
- 1007 */
- 1008 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
- 1009 return ERR_PTR(-EINVAL);
- 1010
- 1011 /*
- 1012 * Siblings of global init remain as zombies on exit since they are
- 1013 * not reaped by their parent (swapper). To solve this and to avoid
- 1014 * multi-rooted process trees, prevent global and container-inits
- 1015 * from creating siblings.
- 1016 */
- 1017 if ((clone_flags & CLONE_PARENT) &&
- 1018 current->signal->flags & SIGNAL_UNKILLABLE)
- 1019 return ERR_PTR(-EINVAL);
- 1020
- 1021 retval = security_task_create(clone_flags);
- 1022 if (retval)
- 1023 goto fork_out;
- 1024
- 1025 retval = -ENOMEM;
- 1026 p = dup_task_struct(current);
- 1027 if (!p)
- 1028 goto fork_out;
- 1029
- 1030 ftrace_graph_init_task(p);
- 1031
- 1032 rt_mutex_init_task(p);
- 1033
- 1034 #ifdef CONFIG_PROVE_LOCKING
- 1035 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
- 1036 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
- 1037 #endif
- 1038 retval = -EAGAIN;
- 1039 if (atomic_read(&p->real_cred->user->processes) >=
- 1040 p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
- 1041 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
- 1042 p->real_cred->user != INIT_USER)
- 1043 goto bad_fork_free;
- 1044 }
- 1045
- 1046 retval = copy_creds(p, clone_flags);
- 1047 if (retval < 0)
- 1048 goto bad_fork_free;
- 1049
- 1050 /*
- 1051 * If multiple threads are within copy_process(), then this check
- 1052 * triggers too late. This doesn't hurt, the check is only there
- 1053 * to stop root fork bombs.
- 1054 */
- 1055 retval = -EAGAIN;
- 1056 if (nr_threads >= max_threads)
- 1057 goto bad_fork_cleanup_count;
- 1058
- 1059 if (!try_module_get(task_thread_info(p)->exec_domain->module))
- 1060 goto bad_fork_cleanup_count;
- 1061
- 1062 p->did_exec = 0;
- 1063 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
- 1064 copy_flags(clone_flags, p);
- 1065 INIT_LIST_HEAD(&p->children);
- 1066 INIT_LIST_HEAD(&p->sibling);
- 1067 rcu_copy_process(p);
- 1068 p->vfork_done = NULL;
- 1069 spin_lock_init(&p->alloc_lock);
- 1070
- 1071 init_sigpending(&p->pending);
- 1072
- 1073 p->utime = cputime_zero;
- 1074 p->stime = cputime_zero;
- 1075 p->gtime = cputime_zero;
- 1076 p->utimescaled = cputime_zero;
- 1077 p->stimescaled = cputime_zero;
- 1078 p->prev_utime = cputime_zero;
- 1079 p->prev_stime = cputime_zero;
- 1080
- 1081 p->default_timer_slack_ns = current->timer_slack_ns;
- 1082
- 1083 task_io_accounting_init(&p->ioac);
- 1084 acct_clear_integrals(p);
- 1085
- 1086 posix_cpu_timers_init(p);
- 1087
- 1088 p->lock_depth = -1; /* -1 = no lock */
- 1089 do_posix_clock_monotonic_gettime(&p->start_time);
- 1090 p->real_start_time = p->start_time;
- 1091 monotonic_to_bootbased(&p->real_start_time);
- 1092 p->io_context = NULL;
- 1093 p->audit_context = NULL;
- 1094 cgroup_fork(p);
- 1095 #ifdef CONFIG_NUMA
- 1096 p->mempolicy = mpol_dup(p->mempolicy);
- 1097 if (IS_ERR(p->mempolicy)) {
- 1098 retval = PTR_ERR(p->mempolicy);
- 1099 p->mempolicy = NULL;
- 1100 goto bad_fork_cleanup_cgroup;
- 1101 }
- 1102 mpol_fix_fork_child_flag(p);
- 1103 #endif
- 1104 #ifdef CONFIG_TRACE_IRQFLAGS
- 1105 p->irq_events = 0;
- 1106 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- 1107 p->hardirqs_enabled = 1;
- 1108 #else
- 1109 p->hardirqs_enabled = 0;
- 1110 #endif
- 1111 p->hardirq_enable_ip = 0;
- 1112 p->hardirq_enable_event = 0;
- 1113 p->hardirq_disable_ip = _THIS_IP_;
- 1114 p->hardirq_disable_event = 0;
- 1115 p->softirqs_enabled = 1;
- 1116 p->softirq_enable_ip = _THIS_IP_;
- 1117 p->softirq_enable_event = 0;
- 1118 p->softirq_disable_ip = 0;
- 1119 p->softirq_disable_event = 0;
- 1120 p->hardirq_context = 0;
- 1121 p->softirq_context = 0;
- 1122 #endif
- 1123 #ifdef CONFIG_LOCKDEP
- 1124 p->lockdep_depth = 0; /* no locks held yet */
- 1125 p->curr_chain_key = 0;
- 1126 p->lockdep_recursion = 0;
- 1127 #endif
- 1128
- 1129 #ifdef CONFIG_DEBUG_MUTEXES
- 1130 p->blocked_on = NULL; /* not blocked yet */
- 1131 #endif
- 1132
- 1133 p->bts = NULL;
- 1134
- 1135 /* Perform scheduler related setup. Assign this task to a CPU. */
- 1136 sched_fork(p, clone_flags);
- 1137
- 1138 retval = perf_event_init_task(p);
- 1139 if (retval)
- 1140 goto bad_fork_cleanup_policy;
- 1141
- 1142 if ((retval = audit_alloc(p)))
- 1143 goto bad_fork_cleanup_policy;
- 1144 /* copy all the process information */
- 1145 if ((retval = copy_semundo(clone_flags, p)))
- 1146 goto bad_fork_cleanup_audit;
- 1147 if ((retval = copy_files(clone_flags, p)))
- 1148 goto bad_fork_cleanup_semundo;
- 1149 if ((retval = copy_fs(clone_flags, p)))
- 1150 goto bad_fork_cleanup_files;
- 1151 if ((retval = copy_sighand(clone_flags, p)))
- 1152 goto bad_fork_cleanup_fs;
- 1153 if ((retval = copy_signal(clone_flags, p)))
- 1154 goto bad_fork_cleanup_sighand;
- 1155 if ((retval = copy_mm(clone_flags, p)))
- 1156 goto bad_fork_cleanup_signal;
- 1157 if ((retval = copy_namespaces(clone_flags, p)))
- 1158 goto bad_fork_cleanup_mm;
- 1159 if ((retval = copy_io(clone_flags, p)))
- 1160 goto bad_fork_cleanup_namespaces;
- 1161 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
- 1162 if (retval)
- 1163 goto bad_fork_cleanup_io;
- 1164
- 1165 if (pid != &init_struct_pid) {
- 1166 retval = -ENOMEM;
- 1167 pid = alloc_pid(p->nsproxy->pid_ns);
- 1168 if (!pid)
- 1169 goto bad_fork_cleanup_io;
- 1170
- 1171 if (clone_flags & CLONE_NEWPID) {
- 1172 retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
- 1173 if (retval < 0)
- 1174 goto bad_fork_free_pid;
- 1175 }
- 1176 }
- 1177
- 1178 p->pid = pid_nr(pid);
- 1179 p->tgid = p->pid;
- 1180 if (clone_flags & CLONE_THREAD)
- 1181 p->tgid = current->tgid;
- 1182
- 1183 if (current->nsproxy != p->nsproxy) {
- 1184 retval = ns_cgroup_clone(p, pid);
- 1185 if (retval)
- 1186 goto bad_fork_free_pid;
- 1187 }
- 1188
- 1189 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
- 1190 /*
- 1191 * Clear TID on mm_release()?
- 1192 */
- 1193 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
- 1194 #ifdef CONFIG_FUTEX
- 1195 p->robust_list = NULL;
- 1196 #ifdef CONFIG_COMPAT
- 1197 p->compat_robust_list = NULL;
- 1198 #endif
- 1199 INIT_LIST_HEAD(&p->pi_state_list);
- 1200 p->pi_state_cache = NULL;
- 1201 #endif
- 1202 /*
- 1203 * sigaltstack should be cleared when sharing the same VM
- 1204 */
- 1205 if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
- 1206 p->sas_ss_sp = p->sas_ss_size = 0;
- 1207
- 1208 /*
- 1209 * Syscall tracing should be turned off in the child regardless
- 1210 * of CLONE_PTRACE.
- 1211 */
- 1212 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
- 1213 #ifdef TIF_SYSCALL_EMU
- 1214 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
- 1215 #endif
- 1216 clear_all_latency_tracing(p);
- 1217
- 1218 /* ok, now we should be set up.. */
- 1219 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
- 1220 p->pdeath_signal = 0;
- 1221 p->exit_state = 0;
- 1222
- 1223 /*
- 1224 * Ok, make it visible to the rest of the system.
- 1225 * We dont wake it up yet.
- 1226 */
- 1227 p->group_leader = p;
- 1228 INIT_LIST_HEAD(&p->thread_group);
- 1229
- 1230 /* Now that the task is set up, run cgroup callbacks if
- 1231 * necessary. We need to run them before the task is visible
- 1232 * on the tasklist. */
- 1233 cgroup_fork_callbacks(p);
- 1234 cgroup_callbacks_done = 1;
- 1235
- 1236 /* Need tasklist lock for parent etc handling! */
- 1237 write_lock_irq(&tasklist_lock);
- 1238
- 1239 /* CLONE_PARENT re-uses the old parent */
- 1240 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
- 1241 p->real_parent = current->real_parent;
- 1242 p->parent_exec_id = current->parent_exec_id;
- 1243 } else {
- 1244 p->real_parent = current;
- 1245 p->parent_exec_id = current->self_exec_id;
- 1246 }
- 1247
- 1248 spin_lock(&current->sighand->siglock);
- 1249
- 1250 /*
- 1251 * Process group and session signals need to be delivered to just the
- 1252 * parent before the fork or both the parent and the child after the
- 1253 * fork. Restart if a signal comes in before we add the new process to
- 1254 * it's process group.
- 1255 * A fatal signal pending means that current will exit, so the new
- 1256 * thread can't slip out of an OOM kill (or normal SIGKILL).
- 1257 */
- 1258 recalc_sigpending();
- 1259 if (signal_pending(current)) {
- 1260 spin_unlock(&current->sighand->siglock);
- 1261 write_unlock_irq(&tasklist_lock);
- 1262 retval = -ERESTARTNOINTR;
- 1263 goto bad_fork_free_pid;
- 1264 }
- 1265
- 1266 if (clone_flags & CLONE_THREAD) {
- 1267 atomic_inc(&current->signal->count);
- 1268 atomic_inc(&current->signal->live);
- 1269 p->group_leader = current->group_leader;
- 1270 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
- 1271 }
- 1272
- 1273 if (likely(p->pid)) {
- 1274 list_add_tail(&p->sibling, &p->real_parent->children);
- 1275 tracehook_finish_clone(p, clone_flags, trace);
- 1276
- 1277 if (thread_group_leader(p)) {
- 1278 if (clone_flags & CLONE_NEWPID)
- 1279 p->nsproxy->pid_ns->child_reaper = p;
- 1280
- 1281 p->signal->leader_pid = pid;
- 1282 tty_kref_put(p->signal->tty);
- 1283 p->signal->tty = tty_kref_get(current->signal->tty);
- 1284 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
- 1285 attach_pid(p, PIDTYPE_SID, task_session(current));
- 1286 list_add_tail_rcu(&p->tasks, &init_task.tasks);
- 1287 __get_cpu_var(process_counts)++;
- 1288 }
- 1289 attach_pid(p, PIDTYPE_PID, pid);
- 1290 nr_threads++;
- 1291 }
- 1292
- 1293 total_forks++;
- 1294 spin_unlock(&current->sighand->siglock);
- 1295 write_unlock_irq(&tasklist_lock);
- 1296 proc_fork_connector(p);
- 1297 cgroup_post_fork(p);
- 1298 perf_event_fork(p);
- 1299 return p;
- 1300
- 1301 bad_fork_free_pid:
- 1302 if (pid != &init_struct_pid)
- 1303 free_pid(pid);
- 1304 bad_fork_cleanup_io:
- 1305 if (p->io_context)
- 1306 exit_io_context(p);
- 1307 bad_fork_cleanup_namespaces:
- 1308 exit_task_namespaces(p);
- 1309 bad_fork_cleanup_mm:
- 1310 if (p->mm)
- 1311 mmput(p->mm);
- 1312 bad_fork_cleanup_signal:
- 1313 if (!(clone_flags & CLONE_THREAD))
- 1314 __cleanup_signal(p->signal);
- 1315 bad_fork_cleanup_sighand:
- 1316 __cleanup_sighand(p->sighand);
- 1317 bad_fork_cleanup_fs:
- 1318 exit_fs(p); /* blocking */
- 1319 bad_fork_cleanup_files:
- 1320 exit_files(p); /* blocking */
- 1321 bad_fork_cleanup_semundo:
- 1322 exit_sem(p);
- 1323 bad_fork_cleanup_audit:
- 1324 audit_free(p);
- 1325 bad_fork_cleanup_policy:
- 1326 perf_event_free_task(p);
- 1327 #ifdef CONFIG_NUMA
- 1328 mpol_put(p->mempolicy);
- 1329 bad_fork_cleanup_cgroup:
- 1330 #endif
- 1331 cgroup_exit(p, cgroup_callbacks_done);
- 1332 delayacct_tsk_free(p);
- 1333 module_put(task_thread_info(p)->exec_domain->module);
- 1334 bad_fork_cleanup_count:
- 1335 atomic_dec(&p->cred->user->processes);
- 1336 exit_creds(p);
- 1337 bad_fork_free:
- 1338 free_task(p);
- 1339 fork_out:
- 1340 return ERR_PTR(retval);
- 1341 }
i.clone_flags共存标识检查
ii.复制进程描述符dup_task_struct
iii.检查用户当前进程数未超限
iv.复制凭证copy_creds
v.检查当前系统范围内进程数未超限;max_threads在fork_init中初始化,=可用内存/内核堆栈大小/8(每个进程都有自己的内核堆栈,每个堆栈大小是4k或8k),即所有进程的内核堆栈总大小不能超过可用内存的1/8
vi.初始化子进程进程描述符,即与父进程不同的地方复制成新值;清空子进程链表、清空pending的信号、设置时间等
vii.初始化进程调度信息sched_fork
viii.复制信号量撤销链表copy_semundo
ix.复制打开文件信息copy_files
x.复制文件系统信息copy_fs
xi.复制信号处理描述符copy_sighand
xii.复制信号描述符copy_signal
xiii.复制地址空间信息copy_mm
xiv.复制命名空间copy_namespaces
xv.复制子进程运行环境copy_thread
xvi.分配子进程id,并设置相应的进程号pid、线程组长号tgid、线程组长group_leader、父进程real_parent
xvii.将进程链入父进程的子进程链表、进程组成员链表、会话组成员链表、进程链表
ii.dup_task_struct
- kernel/fork.c
- 222 static struct task_struct *dup_task_struct(struct task_struct *orig)
- 223 {
- 224 struct task_struct *tsk;
- 225 struct thread_info *ti;
- 226 unsigned long *stackend;
- 227
- 228 int err;
- 229
- 230 prepare_to_copy(orig);
- 231
- 232 tsk = alloc_task_struct();
- 233 if (!tsk)
- 234 return NULL;
- 235
- 236 ti = alloc_thread_info(tsk);
- 237 if (!ti) {
- 238 free_task_struct(tsk);
- 239 return NULL;
- 240 }
- 241
- 242 err = arch_dup_task_struct(tsk, orig);
- 243 if (err)
- 244 goto out;
- 245
- 246 tsk->stack = ti;
- 247
- 248 err = prop_local_init_single(&tsk->dirties);
- 249 if (err)
- 250 goto out;
- 251
- 252 setup_thread_stack(tsk, orig);
- 253 stackend = end_of_stack(tsk);
- 254 *stackend = STACK_END_MAGIC; /* for overflow detection */
- 255
- 256 #ifdef CONFIG_CC_STACKPROTECTOR
- 257 tsk->stack_canary = get_random_int();
- 258 #endif
- 259
- 260 /* One for us, one for whoever does the "release_task()" (usually parent) */
- 261 atomic_set(&tsk->usage,2);
- 262 atomic_set(&tsk->fs_excl, 0);
- 263 #ifdef CONFIG_BLK_DEV_IO_TRACE
- 264 tsk->btrace_seq = 0;
- 265 #endif
- 266 tsk->splice_pipe = NULL;
- 267
- 268 account_kernel_stack(ti, 1);
- 269
- 270 return tsk;
- 271
- 272 out:
- 273 free_thread_info(ti);
- 274 free_task_struct(tsk);
- 275 return NULL;
- 276 }
1.分配进程描述符
2.分配thread_info及内核堆栈
3.复制父进程描述符信息
4.进程描述符与内核堆栈关联起来
5.复制thread_info信息
6.内核堆栈设置幻数,以防堆栈越界
7.设置进程描述符引用计数
iv.copy_creds
- kernel/cred.c
- 427 /*
- 428 * Copy credentials for the new process created by fork()
- 429 *
- 430 * We share if we can, but under some circumstances we have to generate a new
- 431 * set.
- 432 *
- 433 * The new process gets the current process's subjective credentials as its
- 434 * objective and subjective credentials
- 435 */
- 436 int copy_creds(struct task_struct *p, unsigned long clone_flags)
- 437 {
- 438 #ifdef CONFIG_KEYS
- 439 struct thread_group_cred *tgcred;
- 440 #endif
- 441 struct cred *new;
- 442 int ret;
- 443
- 444 mutex_init(&p->cred_guard_mutex);
- 445
- 446 p->replacement_session_keyring = NULL;
- 447
- 448 if (
- 449 #ifdef CONFIG_KEYS
- 450 !p->cred->thread_keyring &&
- 451 #endif
- 452 clone_flags & CLONE_THREAD
- 453 ) {
- 454 p->real_cred = get_cred(p->cred);
- 455 get_cred(p->cred);
- 456 alter_cred_subscribers(p->cred, 2);
- 457 kdebug("share_creds(%p{%d,%d})",
- 458 p->cred, atomic_read(&p->cred->usage),
- 459 read_cred_subscribers(p->cred));
- 460 atomic_inc(&p->cred->user->processes);
- 461 return 0;
- 462 }
- 463
- 464 new = prepare_creds();
- 465 if (!new)
- 466 return -ENOMEM;
- 467
- 468 if (clone_flags & CLONE_NEWUSER) {
- 469 ret = create_user_ns(new);
- 470 if (ret < 0)
- 471 goto error_put;
- 472 }
- 473
- 474 #ifdef CONFIG_KEYS
- 475 /* new threads get their own thread keyrings if their parent already
- 476 * had one */
- 477 if (new->thread_keyring) {
- 478 key_put(new->thread_keyring);
- 479 new->thread_keyring = NULL;
- 480 if (clone_flags & CLONE_THREAD)
- 481 install_thread_keyring_to_cred(new);
- 482 }
- 483
- 484 /* we share the process and session keyrings between all the threads in
- 485 * a process - this is slightly icky as we violate COW credentials a
- 486 * bit */
- 487 if (!(clone_flags & CLONE_THREAD)) {
- 488 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- 489 if (!tgcred) {
- 490 ret = -ENOMEM;
- 491 goto error_put;
- 492 }
- 493 atomic_set(&tgcred->usage, 1);
- 494 spin_lock_init(&tgcred->lock);
- 495 tgcred->process_keyring = NULL;
- 496 tgcred->session_keyring = key_get(new->tgcred->session_keyring);
- 497
- 498 release_tgcred(new);
- 499 new->tgcred = tgcred;
- 500 }
- 501 #endif
- 502
- 503 atomic_inc(&new->user->processes);
- 504 p->cred = p->real_cred = get_cred(new);
- 505 alter_cred_subscribers(new, 2);
- 506 validate_creds(new);
- 507 return 0;
- 508
- 509 error_put:
- 510 put_cred(new);
- 511 return ret;
- 512 }
子进程的主/客体凭证复制自父进程的主体凭证
1.如果是创建线程(启用CONFIG_KEYS时还要求父进程没有线程密钥环thread_keyring)
父进程的主体凭证引用计数器加2(子进程的cred、real_cred各持有一个引用),将子进程的主/客体凭证指向父进程的主体凭证即可
2.如果是创建进程
a.分配凭证描述符并初始化,复制父进程主体凭证、引用计数器置1等
b.子进程主/客体凭证指向新的凭证描述符
用户进程数计数器加1
viii.copy_semundo
- /* ipc/sem.c */
- 1244 /* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between
- 1245 * parent and child tasks.
- 1246 */
- 1247
- 1248 int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
- 1249 {
- 1250 struct sem_undo_list *undo_list;
- 1251 int error;
- 1252
- 1253 if (clone_flags & CLONE_SYSVSEM) {
- 1254 error = get_undo_list(&undo_list);
- 1255 if (error)
- 1256 return error;
- 1257 atomic_inc(&undo_list->refcnt);
- 1258 tsk->sysvsem.undo_list = undo_list;
- 1259 } else
- 1260 tsk->sysvsem.undo_list = NULL;
- 1261
- 1262 return 0;
- 1263 }
1.如果与父进程共享信号量,将子进程的信号量撤销链表指向父进程的撤销链表
2.如果不与父进程共享信号量,清空子进程的信号量撤销链表
ix.copy_files
- /* kernel/fork.c */
- 748 static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
- 749 {
- 750 struct files_struct *oldf, *newf;
- 751 int error = 0;
- 752
- 753 /*
- 754 * A background process may not have any files ...
- 755 */
- 756 oldf = current->files;
- 757 if (!oldf)
- 758 goto out;
- 759
- 760 if (clone_flags & CLONE_FILES) {
- 761 atomic_inc(&oldf->count);
- 762 goto out;
- 763 }
- 764
- 765 newf = dup_fd(oldf, &error);
- 766 if (!newf)
- 767 goto out;
- 768
- 769 tsk->files = newf;
- 770 error = 0;
- 771 out:
- 772 return error;
- 773 }
-
- /* fs/file.c */
- 289 /*
- 290 * Allocate a new files structure and copy contents from the
- 291 * passed in files structure.
- 292 * errorp will be valid only when the returned files_struct is NULL.
- 293 */
- 294 struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
- 295 {
- 296 struct files_struct *newf;
- 297 struct file **old_fds, **new_fds;
- 298 int open_files, size, i;
- 299 struct fdtable *old_fdt, *new_fdt;
- 300
- 301 *errorp = -ENOMEM;
- 302 newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
- 303 if (!newf)
- 304 goto out;
- 305
- 306 atomic_set(&newf->count, 1);
- 307
- 308 spin_lock_init(&newf->file_lock);
- 309 newf->next_fd = 0;
- 310 new_fdt = &newf->fdtab;
- 311 new_fdt->max_fds = NR_OPEN_DEFAULT;
- 312 new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
- 313 new_fdt->open_fds = (fd_set *)&newf->open_fds_init;
- 314 new_fdt->fd = &newf->fd_array[0];
- 315 INIT_RCU_HEAD(&new_fdt->rcu);
- 316 new_fdt->next = NULL;
- 317
- 318 spin_lock(&oldf->file_lock);
- 319 old_fdt = files_fdtable(oldf);
- 320 open_files = count_open_files(old_fdt);
- 321
- 322 /*
- 323 * Check whether we need to allocate a larger fd array and fd set.
- 324 */
- 325 while (unlikely(open_files > new_fdt->max_fds)) {
- 326 spin_unlock(&oldf->file_lock);
- 327
- 328 if (new_fdt != &newf->fdtab) {
- 329 free_fdarr(new_fdt);
- 330 free_fdset(new_fdt);
- 331 kfree(new_fdt);
- 332 }
- 333
- 334 new_fdt = alloc_fdtable(open_files - 1);
- 335 if (!new_fdt) {
- 336 *errorp = -ENOMEM;
- 337 goto out_release;
- 338 }
- 339
- 340 /* beyond sysctl_nr_open; nothing to do */
- 341 if (unlikely(new_fdt->max_fds < open_files)) {
- 342 free_fdarr(new_fdt);
- 343 free_fdset(new_fdt);
- 344 kfree(new_fdt);
- 345 *errorp = -EMFILE;
- 346 goto out_release;
- 347 }
- 348
- 349 /*
- 350 * Reacquire the oldf lock and a pointer to its fd table
- 351 * who knows it may have a new bigger fd table. We need
- 352 * the latest pointer.
- 353 */
- 354 spin_lock(&oldf->file_lock);
- 355 old_fdt = files_fdtable(oldf);
- 356 open_files = count_open_files(old_fdt);
- 357 }
- 358
- 359 old_fds = old_fdt->fd;
- 360 new_fds = new_fdt->fd;
- 361
- 362 memcpy(new_fdt->open_fds->fds_bits,
- 363 old_fdt->open_fds->fds_bits, open_files/8);
- 364 memcpy(new_fdt->close_on_exec->fds_bits,
- 365 old_fdt->close_on_exec->fds_bits, open_files/8);
- 366
- 367 for (i = open_files; i != 0; i--) {
- 368 struct file *f = *old_fds++;
- 369 if (f) {
- 370 get_file(f);
- 371 } else {
- 372 /*
- 373 * The fd may be claimed in the fd bitmap but not yet
- 374 * instantiated in the files array if a sibling thread
- 375 * is partway through open(). So make sure that this
- 376 * fd is available to the new process.
- 377 */
- 378 FD_CLR(open_files - i, new_fdt->open_fds);
- 379 }
- 380 rcu_assign_pointer(*new_fds++, f);
- 381 }
- 382 spin_unlock(&oldf->file_lock);
- 383
- 384 /* compute the remainder to be cleared */
- 385 size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
- 386
- 387 /* This is long word aligned thus could use a optimized version */
- 388 memset(new_fds, 0, size);
- 389
- 390 if (new_fdt->max_fds > open_files) {
- 391 int left = (new_fdt->max_fds-open_files)/8;
- 392 int start = open_files / (8 * sizeof(unsigned long));
- 393
- 394 memset(&new_fdt->open_fds->fds_bits[start], 0, left);
- 395 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
- 396 }
- 397
- 398 rcu_assign_pointer(newf->fdt, new_fdt);
- 399
- 400 return newf;
- 401
- 402 out_release:
- 403 kmem_cache_free(files_cachep, newf);
- 404 out:
- 405 return NULL;
- 406 }
1.如果共享打开文件,将父进程打开文件描述符引用计数器加1
2.如果不共享,复制父进程打开文件描述符;
A.分配打开文件描述符
B.初始化打开文件描述符
a.打开文件描述符引用计数器置1
b.将打开文件描述符表指向预分配的文件描述符表,并初始化描述符表的打开文件位图、exec关闭文件位图、file数组指向预分配结构
C.如果预分配的文件描述符表小于父进程已经打开的文件,则重新分配打开文件描述符表
D.复制父进程打开文件位图、exec关闭文件位图信息
E.将父进程打开的文件引用计数器加1,并添加到子进程打开文件描述符表中
F.将文件描述符表中多出的file数组、打开文件位图、exec关闭文件位图清空
G.将打开文件描述符中的fdt指向文件描述符表
下图表示files_struct与file之间的关系:
x.copy_fs
- /* kernel/fork.c */
- 728 static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
- 729 {
- 730 struct fs_struct *fs = current->fs;
- 731 if (clone_flags & CLONE_FS) {
- 732 /* tsk->fs is already what we want */
- 733 write_lock(&fs->lock);
- 734 if (fs->in_exec) {
- 735 write_unlock(&fs->lock);
- 736 return -EAGAIN;
- 737 }
- 738 fs->users++;
- 739 write_unlock(&fs->lock);
- 740 return 0;
- 741 }
- 742 tsk->fs = copy_fs_struct(fs);
- 743 if (!tsk->fs)
- 744 return -ENOMEM;
- 745 return 0;
- 746 }
1.如果设置了CLONE_FS,与父进程共享文件系统描述符,引用计数器users加1即可;若该描述符正处于exec过程中(fs->in_exec),则返回-EAGAIN
2.否则复制文件系统描述符,包含当前目录及根目录信息
xi.copy_sighand
- /* kernel/fork.c */
- 800 static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
- 801 {
- 802 struct sighand_struct *sig;
- 803
- 804 if (clone_flags & CLONE_SIGHAND) {
- 805 atomic_inc(&current->sighand->count);
- 806 return 0;
- 807 }
- 808 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
- 809 rcu_assign_pointer(tsk->sighand, sig);
- 810 if (!sig)
- 811 return -ENOMEM;
- 812 atomic_set(&sig->count, 1);
- 813 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
- 814 return 0;
- 815 }
1.如果共享信号处理,父进程信号处理描述符引用计数器加1即可
2.分配信号处理描述符,复制父进程信号处理描述符表;新的信号处理描述符表引用计数器置1,并与子进程关联
xii.copy_signal
- /* kernel/fork.c */
- 857 static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
- 858 {
- 859 struct signal_struct *sig;
- 860
- 861 if (clone_flags & CLONE_THREAD)
- 862 return 0;
- 863
- 864 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
- 865 tsk->signal = sig;
- 866 if (!sig)
- 867 return -ENOMEM;
- 868
- 869 atomic_set(&sig->count, 1);
- 870 atomic_set(&sig->live, 1);
- 871 init_waitqueue_head(&sig->wait_chldexit);
- 872 sig->flags = 0;
- 873 if (clone_flags & CLONE_NEWPID)
- 874 sig->flags |= SIGNAL_UNKILLABLE;
- 875 sig->group_exit_code = 0;
- 876 sig->group_exit_task = NULL;
- 877 sig->group_stop_count = 0;
- 878 sig->curr_target = tsk;
- 879 init_sigpending(&sig->shared_pending);
- 880 INIT_LIST_HEAD(&sig->posix_timers);
- 881
- 882 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- 883 sig->it_real_incr.tv64 = 0;
- 884 sig->real_timer.function = it_real_fn;
- 885
- 886 sig->leader = 0; /* session leadership doesn't inherit */
- 887 sig->tty_old_pgrp = NULL;
- 888 sig->tty = NULL;
- 889
- 890 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
- 891 sig->gtime = cputime_zero;
- 892 sig->cgtime = cputime_zero;
- 893 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
- 894 sig->prev_utime = sig->prev_stime = cputime_zero;
- 895 #endif
- 896 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
- 897 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
- 898 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
- 899 sig->maxrss = sig->cmaxrss = 0;
- 900 task_io_accounting_init(&sig->ioac);
- 901 sig->sum_sched_runtime = 0;
- 902 taskstats_tgid_init(sig);
- 903
- 904 task_lock(current->group_leader);
- 905 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
- 906 task_unlock(current->group_leader);
- 907
- 908 posix_cpu_timers_init_group(sig);
- 909
- 910 acct_init_pacct(&sig->pacct);
- 911
- 912 tty_audit_fork(sig);
- 913
- 914 sig->oom_adj = current->signal->oom_adj;
- 915
- 916 return 0;
- 917 }
1.如果创建线程,跳过
2.分配信号描述符
3.初始化信号描述符,引用计数器置1、清空共享信号pending
xiii.copy_mm
- /* kernel/fork.c */
- 681 static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
- 682 {
- 683 struct mm_struct * mm, *oldmm;
- 684 int retval;
- 685
- 686 tsk->min_flt = tsk->maj_flt = 0;
- 687 tsk->nvcsw = tsk->nivcsw = 0;
- 688 #ifdef CONFIG_DETECT_HUNG_TASK
- 689 tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
- 690 #endif
- 691
- 692 tsk->mm = NULL;
- 693 tsk->active_mm = NULL;
- 694
- 695 /*
- 696 * Are we cloning a kernel thread?
- 697 *
- 698 * We need to steal a active VM for that..
- 699 */
- 700 oldmm = current->mm;
- 701 if (!oldmm)
- 702 return 0;
- 703
- 704 if (clone_flags & CLONE_VM) {
- 705 atomic_inc(&oldmm->mm_users);
- 706 mm = oldmm;
- 707 goto good_mm;
- 708 }
- 709
- 710 retval = -ENOMEM;
- 711 mm = dup_mm(tsk);
- 712 if (!mm)
- 713 goto fail_nomem;
- 714
- 715 good_mm:
- 716 /* Initializing for Swap token stuff */
- 717 mm->token_priority = 0;
- 718 mm->last_interval = 0;
- 719
- 720 tsk->mm = mm;
- 721 tsk->active_mm = mm;
- 722 return 0;
- 723
- 724 fail_nomem:
- 725 return retval;
- 726 }
如果当前进程是内核线程(current->mm为空),无需复制,直接返回;如果设置了CLONE_VM,与父进程共享地址空间,mm_users加1即可;否则通过dup_mm复制地址空间,步骤如下:
1.分配地址空间描述符mm_struct
2.复制父进程地址空间描述符信息
3.初始化子进程地址空间描述符,覆盖从父进程地址空间描述符那里复制的部分;如引用数、map读写信号等
4.可执行文件引用计数器加1,以便在父进程退出后可执行文件也不会关闭
5.复制地址空间的映射区
遍历地址空间段
A.分配地址空间段描述符
B.复制地址空间段描述符信息
C.初始化地址空间段描述符,如将描述符从父进程链表&红黑树中删除、指向子进程的地址空间描述符等
D.如果是文件映射;文件引用计数器加1等
E.将地址空间段描述符添加到描述符链表及描述符红黑树中
F.映射区计数加1
G.复制页表copy_page_range
修改可写页表项,设置写保护标志(父子进程的页表项都变为只读);当往该页写数据时产生缺页异常,缺页异常处理会为进程分配新的页帧,这就是COW(copy on write)技术,这样做既加速了fork过程又节省了内存;而只读页帧会在进程间共享,如代码段
dup_mm->dup_mmap->copy_page_range->copy_pud_range->copy_pmd_range->copy_pte_range->copy_one_pte->ptep_set_wrprotect&pte_wrprotect
H.回调地址空间段的open函数(vm_ops->open)
xiv.copy_namespaces
- /* kernel/nsproxy.c */
- 103 /*
- 104 * called from clone. This now handles copy for nsproxy and all
- 105 * namespaces therein.
- 106 */
- 107 int copy_namespaces(unsigned long flags, struct task_struct *tsk)
- 108 {
- 109 struct nsproxy *old_ns = tsk->nsproxy;
- 110 struct nsproxy *new_ns;
- 111 int err = 0;
- 112
- 113 if (!old_ns)
- 114 return 0;
- 115
- 116 get_nsproxy(old_ns);
- 117
- 118 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- 119 CLONE_NEWPID | CLONE_NEWNET)))
- 120 return 0;
- 121
- 122 if (!capable(CAP_SYS_ADMIN)) {
- 123 err = -EPERM;
- 124 goto out;
- 125 }
- 126
- 127 /*
- 128 * CLONE_NEWIPC must detach from the undolist: after switching
- 129 * to a new ipc namespace, the semaphore arrays from the old
- 130 * namespace are unreachable. In clone parlance, CLONE_SYSVSEM
- 131 * means share undolist with parent, so we must forbid using
- 132 * it along with CLONE_NEWIPC.
- 133 */
- 134 if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) {
- 135 err = -EINVAL;
- 136 goto out;
- 137 }
- 138
- 139 new_ns = create_new_namespaces(flags, tsk, tsk->fs);
- 140 if (IS_ERR(new_ns)) {
- 141 err = PTR_ERR(new_ns);
- 142 goto out;
- 143 }
- 144
- 145 tsk->nsproxy = new_ns;
- 146
- 147 out:
- 148 put_nsproxy(old_ns);
- 149 return err;
- 150 }
1.如果clone_flags没有使用新命名空间标识,直接返回
2.检查权限(需要CAP_SYS_ADMIN)及标志合法性(CLONE_NEWIPC不能与CLONE_SYSVSEM同时使用)后,根据clone_flags中的新命名空间标识创建新的命名空间,并将子进程的nsproxy指向新命名空间
xv.复制子进程运行环境copy_thread
- /* arch/x86/kernel/process_32.c */
- 242 int copy_thread(unsigned long clone_flags, unsigned long sp,
- 243 unsigned long unused,
- 244 struct task_struct *p, struct pt_regs *regs)
- 245 {
- 246 struct pt_regs *childregs;
- 247 struct task_struct *tsk;
- 248 int err;
- 249
- 250 childregs = task_pt_regs(p);
- 251 *childregs = *regs;
- 252 childregs->ax = 0;
- 253 childregs->sp = sp;
- 254
- 255 p->thread.sp = (unsigned long) childregs;
- 256 p->thread.sp0 = (unsigned long) (childregs+1);
- 257
- 258 p->thread.ip = (unsigned long) ret_from_fork;
- 259
- 260 task_user_gs(p) = get_user_gs(regs);
- 261
- 262 tsk = current;
- 263 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
- 264 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
- 265 IO_BITMAP_BYTES, GFP_KERNEL);
- 266 if (!p->thread.io_bitmap_ptr) {
- 267 p->thread.io_bitmap_max = 0;
- 268 return -ENOMEM;
- 269 }
- 270 set_tsk_thread_flag(p, TIF_IO_BITMAP);
- 271 }
- 272
- 273 err = 0;
- 274
- 275 /*
- 276 * Set a new TLS for the child thread?
- 277 */
- 278 if (clone_flags & CLONE_SETTLS)
- 279 err = do_set_thread_area(p, -1,
- 280 (struct user_desc __user *)childregs->si, 0);
- 281
- 282 if (err && p->thread.io_bitmap_ptr) {
- 283 kfree(p->thread.io_bitmap_ptr);
- 284 p->thread.io_bitmap_max = 0;
- 285 }
- 286
- 287 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
- 288 p->thread.ds_ctx = NULL;
- 289
- 290 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
- 291 p->thread.debugctlmsr = 0;
- 292
- 293 return err;
- 294 }
1.将父进程内核堆栈中保存的寄存器值,复制到子进程的内核堆栈中;以便子进程在fork返回时与原进程状态一致
2.设置子进程fork/vfork/clone返回值为0(系统调用返回值放在ax中)
3.设置子进程用户空间的堆栈;fork、vfork的父进程与子进程的堆栈值一样:fork的父子进程使用不同的进程地址空间,所以不会存在冲突;vfork虽然通过CLONE_VM与父进程共享地址空间,但父进程会被挂起直到子进程exec或退出,所以也不会存在冲突
4.设置子进程的内核堆栈,因为子进程直接从fork返回,所以将栈顶设置成保存寄存器的起始位置
5.设置子进程被调度后从ret_from_fork开始执行
dup_task_struct及copy_thread之后进程描述符及内核堆栈,如下图所示:
xvi.alloc_pid
- /* kernel/pid.c */
- 245 struct pid *alloc_pid(struct pid_namespace *ns)
- 246 {
- 247 struct pid *pid;
- 248 enum pid_type type;
- 249 int i, nr;
- 250 struct pid_namespace *tmp;
- 251 struct upid *upid;
- 252
- 253 pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
- 254 if (!pid)
- 255 goto out;
- 256
- 257 tmp = ns;
- 258 for (i = ns->level; i >= 0; i--) {
- 259 nr = alloc_pidmap(tmp);
- 260 if (nr < 0)
- 261 goto out_free;
- 262
- 263 pid->numbers[i].nr = nr;
- 264 pid->numbers[i].ns = tmp;
- 265 tmp = tmp->parent;
- 266 }
- 267
- 268 get_pid_ns(ns);
- 269 pid->level = ns->level;
- 270 atomic_set(&pid->count, 1);
- 271 for (type = 0; type < PIDTYPE_MAX; ++type)
- 272 INIT_HLIST_HEAD(&pid->tasks[type]);
- 273
- 274 spin_lock_irq(&pidmap_lock);
- 275 for (i = ns->level; i >= 0; i--) {
- 276 upid = &pid->numbers[i];
- 277 hlist_add_head_rcu(&upid->pid_chain,
- 278 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
- 279 }
- 280 spin_unlock_irq(&pidmap_lock);
- 281
- 282 out:
- 283 return pid;
- 284
- 285 out_free:
- 286 while (++i <= ns->level)
- 287 free_pidmap(pid->numbers + i);
- 288
- 289 kmem_cache_free(ns->pid_cachep, pid);
- 290 pid = NULL;
- 291 goto out;
- 292 }
1.分配pid结构,从当前pid命名空间开始逐级向上直到根命名空间,在每一级的pidmap中查找一个空闲id,并保存到pid->numbers[]对应的层级中
2.将pid(各层级的upid)链入pid哈希表中,方便根据id找到进程描述符
pid分配如下图所示:
xvii.attach_pid
- /* kernel/pid.c */
- 315 /*
- 316 * attach_pid() must be called with the tasklist_lock write-held.
- 317 */
- 318 void attach_pid(struct task_struct *task, enum pid_type type,
- 319 struct pid *pid)
- 320 {
- 321 struct pid_link *link;
- 322
- 323 link = &task->pids[type];
- 324 link->pid = pid;
- 325 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
- 326 }
1.取进程描述符中类型为type的pid_link
2.将link->pid置成pid
3.将link->node链入pid中对应类型的链表pid->tasks[type]
下图表示进程添加到相应的进程、进程组、会话组后的关系:
|