The fork function first looks for a free slot in the task table and picks a new process ID (pid). If a slot is found, it copies the current process's task_struct and then copies the page-table entries, clearing the R/W bit in each of them in preparation for copy-on-write, so the child process shares memory with its parent. After that, signals are processed. On a task switch, the CPU automatically saves the state of the outgoing process into its TSS and loads the state of the incoming process from the TSS stored in its task_struct. Once the fork has completed, the child is ready to be scheduled. Note that the child's saved EAX register, which carries the function's return value, is initialized to 0. That is, fork returns 0 in the child and the child's PID in the parent.

Definition of fork function

init/main.c

/* Defines the user-space stub int fork(void) by expanding the _syscall0 macro from include/unistd.h. */
static inline _syscall0(int,fork)
Copy the code

include/unistd.h

#define __NR_fork	2	/* system-call number of fork (its index in sys_call_table) */

/*
 * _syscall0(type,name) - generate the user-space stub for a system call
 * that takes no arguments.
 *
 * The stub loads __NR_<name> into EAX, traps into the kernel with
 * int $0x80 and reads the result back from EAX.
 * A non-negative result is returned as-is (cast to `type`); a negative
 * result is negated, stored in errno, and the stub returns -1.
 */
#define _syscall0(type,name) \
type name(void) \
{ \
	long __res; \
	__asm__ volatile ("int $0x80" \
		: "=a" (__res) \
		: "0" (__NR_##name)); \
	if (__res >= 0) \
		return (type) __res; \
	errno = -__res; \
	return -1; \
}

The macro expands

/*
 * Result of expanding _syscall0(int,fork).
 *
 * Returns the non-negative value the kernel left in EAX, or -1 with
 * errno set to the negated kernel result on failure.
 */
int fork(void)
{
	long __res;

	/* EAX carries __NR_fork in and the system call's result out. */
	__asm__ volatile ("int $0x80"
		: "=a" (__res)
		: "0" (__NR_fork));
	if (__res >= 0)
		return (int) __res;
	errno = -__res;
	return -1;
}

"0" (__NR_fork) means that this input operand uses the same constraint as operand 0, i.e. the value 2 of __NR_fork is placed in the EAX register. "=a" (__res) binds __res to EAX as the output operand. The return value of the system call is left in EAX and therefore ends up in __res.

Executing int 0x80 raises an interrupt that enters the system_call routine, which uses the function index passed in EAX to look up the handler in the system call table sys_call_table and then calls it.

kernel/sched.c

The handler for interrupt 0x80 is installed in the sched.c file.

void sched_init(void) { ... set_system_gate(0x80,&system_call); . }Copy the code

include/linux/sys.h

/* System-call dispatch table, indexed by call number: entry 2 is sys_fork, matching __NR_fork == 2. */
fn_ptr sys_call_table[] = { sys_setup, sys_exit, sys_fork,... };

Copy the code

nr_system_calls = 72		# total number of system calls in the Linux 0.11 kernel

# int 0x80 entry point: eax holds the system-call number, ebx/ecx/edx its arguments.
system_call:
	cmpl $nr_system_calls-1,%eax	# is the call number out of range?
	ja bad_sys_call			# yes: eax is set to -1 and we return
	push %ds			# save the original segment register values
	push %es
	push %fs
	pushl %edx			# ebx, ecx, edx are the parameters of the C function
	pushl %ecx			# that implements the system call
	pushl %ebx
	movl $0x10,%edx			# selector 0x10: ds, es -> kernel data segment
	mov %dx,%ds
	mov %dx,%es
	movl $0x17,%edx			# 0x17 = 0001 0111b: RPL=3, TI=1, index=2
	mov %dx,%fs			# fs -> local data segment of the calling task
	call sys_call_table(,%eax,4)	# call the handler at sys_call_table + eax*4
	pushl %eax			# push the system call's return value
	movl current,%eax		# eax = address of the current task structure
# Check the state of the current task. If it is not in the ready state
# (state != 0), run the scheduler. If the task is ready but its time slice
# has run out (counter = 0), the scheduler is run as well.
# For example, when a process in a background process group performs
# read/write operations on the controlling terminal, all processes in that
# group receive SIGTTIN or SIGTTOU by default, which stops every process in
# the group. The current process will return immediately.
	cmpl $0,state(%eax)		# state
	jne reschedule
	cmpl $0,counter(%eax)		# counter
	je reschedule
# (excerpt ends here; the rest of the routine is not shown)
# fork system call: find a free task slot, then let copy_process build the child.
.align 2
sys_fork:
	call find_empty_process		# eax = free task index, or a negative error code
	testl %eax,%eax
	js 1f				# negative: jump forward to label 1 and return
	push %gs			# these five pushes become the first five
	pushl %esi			# arguments of copy_process:
	pushl %edi			# (nr, ebp, edi, esi, gs)
	pushl %ebp
	pushl %eax
	call copy_process
	addl $20,%esp			# drop the five pushed values (5 * 4 bytes)
1:	ret

kernel/sched.c

/* Task pointer array with NR_TASKS (= 64) slots; slot 0 is initialised to point at the initial task. */
struct task_struct * task[NR_TASKS] = {&(init_task.task), };

find_empty_process picks a new, unused process ID (stored in last_pid) and returns the index of a free slot in the global task array, i.e. the task number.

kernel/fork.c

/* Most recently handed-out process ID; copy_process() assigns it to the new task. */
long last_pid=0;

/*
 * find_empty_process - choose a fresh pid and a free task slot.
 *
 * Advances last_pid until no existing task uses that value (restarting
 * at 1 if the counter goes negative), then searches task[] for an
 * unused entry.
 *
 * Returns the index of the free slot, or -EAGAIN when every slot is taken.
 */
int find_empty_process(void)
{
	int i;

	repeat:
		if ((++last_pid)<0) last_pid=1;
		for(i=0 ; i<NR_TASKS ; i++)
			if (task[i] && task[i]->pid == last_pid) goto repeat;
	for(i=1 ; i<NR_TASKS ; i++)	/* task 0 is excluded */
		if (!task[i])
			return i;
	return -EAGAIN;
}
int copy_process(int nr,long ebp,long edi,long esi,long gs,long none, long ebx,long ecx,long edx, long fs,long es,long ds, long eip,long cs,long eflags,long esp,long ss) { struct task_struct *p; int i; struct file *f; p = (struct task_struct *) get_free_page(); if (! p) return -EAGAIN; task[nr] = p; *p = *current; /* The structure data of the current process is copied to p */ p->state = TASK_UNINTERRUPTIBLE; P ->pid = last_pid; p->pid = last_pid; // New process id. p->father = current->pid; // set the parent process p->counter = p->priority; P ->signal = 0; P ->alarm = 0; P ->leader = 0; /* process leadership doesn't inherit */ p->utime = p->stime = 0; P ->cutime = p->cstime = 0; P ->start_time = jiffies; P ->tss.back_link = 0; p->tss.back_link = 0; p->tss.esp0 = PAGE_SIZE + (long) p; // Task kernel stack pointer. P ->tss.ss0 = 0x10; (PAGE_SIZE+(long)p) esp0 = 0x10; // Kernel stack segment selector p->tss.eip = eip; P ->tss.eflags = eflags; // register p->tss.eax = 0; // This is why the new process returns 0 when fork() returns p->tss.ecx = ecx; p->tss.edx = edx; p->tss.ebx = ebx; p->tss.esp = esp; p->tss.ebp = ebp; p->tss.esi = esi; p->tss.edi = edi; p->tss.es = es & 0xffff; // Segment register only 16 bits valid p->tss.cs = cs&0xFFFF; p->tss.ss = ss & 0xffff; p->tss.ds = ds & 0xffff; p->tss.fs = fs & 0xffff; p->tss.gs = gs & 0xffff; p->tss.ldt = _LDT(nr); // task local table descriptor selector (LDT descriptor in GDT) p-> tsS. trace_bitmap = 0x80000000; If (last_task_used_math == current) __asm__(" CLTS; fnsave %0"::"m" (p->tss.i387)); // Next copy the process page table. That is, set the base address and limit length in the new task code segment and data segment descriptors in the linear address space, // and copy the page table. If there is an error (the return value is not 0), the corresponding item in the task array is reset and the memory page allocated for the // task structure for the new task is freed. 
if (copy_mem(nr,p)) { task[nr] = NULL; free_page((long) p); return -EAGAIN; } // If there are open files in the parent process, increase the number of open files by 1, because the child process will share these open files with the parent process. Increase the PWD, root, and executable references of the current process (parent) by 1. for (i=0; i<NR_OPEN; i++) if ((f=p->filp[i])) f->f_count++; if (current->pwd) current->pwd->i_count++; if (current->root) current->root->i_count++; if (current->executable) current->executable->i_count++; set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss)); set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY,&(p->ldt)); p->state = TASK_RUNNING; /* Task ready, waiting for scheduling */ return last_pid; }Copy the code

Get_free_page function analysis is available here

Because Linux uses the copy-on-write technique, copy_mem only sets up the new process's own page directory entries and page table entries and does not actually allocate physical memory pages for it. The new process shares all memory pages with its parent. The system sets the maximum number of segment descriptors in the global descriptor table (GDT) to 256, of which two are idle, two are used by the system, and two are used by each process. The GDT could therefore hold at most (256-4)/2 = 126 tasks. The kernel, however, defines NR_TASKS = 64, and each task is given a 64 MB range, so the virtual address range used by all tasks is 64 x 64 MB = 4 GB. 4 GB happens to be exactly the size of the CPU's linear (and physical) address space, so it is easy to confuse the three kinds of address in the 0.11 kernel. The way the memory space is used has changed since Linux kernel version 0.99: each process can use the entire 4 GB address range independently.

kernel/fork.c

int copy_mem(int nr,struct task_struct * p) { unsigned long old_data_base,new_data_base,data_limit; unsigned long old_code_base,new_code_base,code_limit; code_limit=get_limit(0x0f); Data_limit =get_limit(0x17); // RPL=3 TI=1; Old_code_base = get_base(current-> LDT [1]); old_code_base = get_base(current-> LDT [1]) Old_data_base = get_base(current-> LDT [2]); If (old_data_base! = old_code_base) panic("We don't support separate I&D"); if (data_limit < code_limit) panic("Bad data_limit"); new_data_base = new_code_base = nr * 0x4000000; // New base = task number *64Mb(task size). p->start_code = new_code_base; set_base(p->ldt[1],new_code_base); set_base(p->ldt[2],new_data_base); // Set the page directory entry and page entry for the new process. If (copy_page_tables(old_data_BASE,new_data_base,data_limit)) {printk("free_page_tables: from copy_mem\n"); free_page_tables(new_data_base,data_limit); return -ENOMEM; } return 0; }Copy the code

include/linux/sched.h

#define get_limit(segment) ({ \ unsigned long __limit; \ __asm__("lsll %1,%0\n\tincl %0":"=r" (__limit):"r" (segment)); \ __limit; }) #define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) ) #define get_base(ldt) _get_base( ((char *)&(ldt)) ) #define _set_base(addr,base) \ __asm__ ("push %%edx\n\t" \ "movw %%dx,%0\n\t" \ "rorl $16,%%edx\n\t" \ "movb %%dl,%1\n\t" \ "movb %%dh,%2\n\t" \ "pop %%edx" \ ::"m" (*((addr)+2)),  \ "m" (*((addr)+4)), \ "m" (*((addr)+7)), Static inline unsigned long _get_base(char * addr) {unsigned long __base; static inline unsigned long _get_base(char * addr) {unsigned long __base; __asm__("movb %3,%%dh\n\t" "movb %2,%%dl\n\t" "shll $16,%%edx\n\t" "movw %1,%%dx" :"=&d" (__base) :"m" (*((addr)+2)), "m" (*((addr)+4)), "m" (*((addr)+7))); return __base; }Copy the code

The LSL instruction is short for Load Segment Limit. It gathers the scattered limit bits from the descriptor referenced by the given selector and assembles them in the specified register. The resulting segment limit is the actual number of bytes minus 1, so 1 has to be added before the value is returned.

movb %3,%%dh: take the highest 8 bits (bits 31-24) of the base address from [addr+7] -> dh

movb %2,%%dl: take the lower 8 bits (bits 23-16) of the upper 16 bits of the base address from [addr+4] -> dl

shll $16,%%edx: shift the upper 16 bits of the base address into the upper 16 bits of edx

movw %1,%%dx: take the lower 16 bits (bits 15-0) of the base address from [addr+2] -> dx

mm/memory.c

int copy_page_tables(unsigned long from,unsigned long to,long size) { unsigned long * from_page_table; unsigned long * to_page_table; unsigned long this_page; unsigned long * from_dir, * to_dir; unsigned long nr; If ((from & 0 x3fffff) | | (to & 0 x3fffff)) / / 4 MB on the border, because a page can manage 4 MB of memory panic (" copy_page_tables called with wrong alignment "); From_dir = (unsigned long *) ((from>>20) & 0xffc); /* _pg_dir = 0 */ to_dir = (unsigned long *) ((to>>20) & 0xffc); Size = ((unsigned) (size+0x3fffff)) >> 22; // Copy each occupied page table for(; size-->0 ; From_dir ++,to_dir++) {if (1 & *to_dir) // If the page table specified by the destination directory entry already exists (P=1), panic(" COPY_page_tables: already exist"); if (! (1 & *from_dir)) // If this source directory entry is not in use, do not copy the corresponding page table, skip continue; From_page_table from_page_table = (unsigned long *) (0xffffF000&* from_dir); if (! (to_page_table = (unsigned long *) get_free_page())) return -1; /* Out of memory, see freeing */ // Set destination directory item information. 7 is marking information, said (Usr, R/W, the Present) * to_dir = ((unsigned long) to_page_table) | 7; // Set the number of pages to be copied for the page table being processed. If in kernel space, only the first 160 pages (640KB) need to be copied, otherwise all 1024 pages in a page table need to be copied nr = (from==0)? 0xA0:1024; // For the current page table, start copying a specified number of nr memory pages. for ( ; nr-- > 0 ; from_page_table++,to_page_table++) { this_page = *from_page_table; if (! (1 & this_page)) // If the current source page is not in use, do not copy continue; this_page &= ~2; // Reset the R/W flag in the page entry (set 0). 
*to_page_table = this_page; *to_page_table = this_page; // If the page is larger than 1MB, set mem_map[], then calculate // the page number, and use it as the index to increase the number of references in the corresponding item of the page mapping array. For pages below 1MB, // is a kernel page, so mem_map[] does not need to be set. If (this_page > LOW_MEM) {*from_page_table = this_page; This_page -= LOW_MEM; this_page -= LOW_MEM; this_page >>= 12; mem_map[this_page]++; } } } invalidate(); // refresh page transform buffer return 0; }Copy the code