register | high | low |
---|---|---|
eax | 0xaa55 | 0xaa55 |
ebx | 0x0 | 0 |
ecx | 0x0 | 0 |
process透過system call進入kernel,執行完後return,因此process會再user space和kernel space交互執行。kernel有保護機制,當process執行system call時,需要硬體提升權限才能執行kernel裡的功能。
shell
shell是一個接收使用者命令並執行的程序,非kernel的一部份。
xv6’s boot loader將kernel從硬碟載入後,接著設定page table,將virtual addresses 0x0和0x80000000(KERNBASE)映射到physical addresses 0x0,由於%cr4的PSE位置為1,所以現在分頁大小為4Mbytes,最後將entrypgdir的physical memory載入%cr3,%cr0的PG設置為1,啟動分頁再跳轉至main。
在main初始化後,會呼叫userinit來創建第一個process,userinit會呼叫allocproc,配置struct proc並初始化,透過特殊設計讓thread首先在forkret運行,然後回到trapret,回復暫存器後跳轉至process。
第一個process會先執行initcode.S,得到最初的user memory內容,接著執行setupkvm和inituvm,最後userinit設定trap frame為最初user mode state。
待所有初始化完成後,userinit將p->state設置為RUNNABLE,使process可以被調度。
OS透過page tables來達成對記憶體的控制,讓不同process的address spaces能映射到physical memory上,同時保護各個process的記憶體。x86指令都是用virtual addresses計算,所以透過維護page table,將每個virtual address映射到physical address上。
一個x86 page table包含2^20(1,048,576)page table entries(PTEs),每個PTE包含20-bit physical page number(PPN)和額外的flags。paging hardware透過前20bits來找到page table的index,將前20bits替換為PTE的PPN,剩餘12bits保持原狀,得到physical address位置,而一個page大小即為2^12(4096)bytes。
實際上,page table是一個存放在physical memory的two-level tree。root是一個4096-byte page directory,裡面包含1024個類似PTE的參考,分別指向page table pages,而每個page table又包含1024個32-bit PTE。透過前10bits找到page directory entry,接著透過其後10bits找到對應PTE。
每個PTE的flags代表其使用權限和狀態
相關flags定義
//mmu.h
#define PTE_P 0x001 // Present
#define PTE_W 0x002 // Writeable
#define PTE_U 0x004 // User
#define PTE_PWT 0x008 // Write−Through
#define PTE_PCD 0x010 // Cache−Disable
#define PTE_A 0x020 // Accessed
#define PTE_D 0x040 // Dirty
#define PTE_PS 0x080 // Page Size
#define PTE_MBZ 0x180 // Bits must be zero
//memlayout.h
#define EXTMEM 0x100000 // Start of extended memory
#define PHYSTOP 0xE000000 // Top physical memory
#define DEVSPACE 0xFE000000 // Other devices are at high addresses
#define KERNBASE 0x80000000 // First kernel virtual address
#define KERNLINK (KERNBASE+EXTMEM) // Address where kernel is linked
...
由於KERNBASE限制,每一個process只能使用2GB memory。
//main.c
int main(void)
{
kinit1(end, P2V(4*1024*1024)); // phys page allocator
kvmalloc(); // kernel page table
//中略...
kinit2(P2V(4*1024*1024), P2V(PHYSTOP)); // must come after startothers()
}
Physical memory allocator
一開始受限於entrypgdir規定,main呼叫kinit1分配了4MB大小的page,待完整page table建立出來後,再呼叫kinit2初始化剩餘部分。
//kalloc.c
// 1. main() calls kinit1() while still using entrypgdir to place just
// the pages mapped by entrypgdir on free list.
// 2. main() calls kinit2() with the rest of the physical pages
// after installing a full page table that maps them on all cores.
void kinit1(void *vstart, void *vend)
{
initlock(&kmem.lock, "kmem");
kmem.use_lock = 0;
freerange(vstart, vend); // add memory to the free list via per-page
}
void kinit2(void *vstart, void *vend)
{
freerange(vstart, vend);
kmem.use_lock = 1;
}
資料結構
透過linked list方式將free page串起來
//kalloc.c
struct run {
struct run *next;
};
struct {
struct spinlock lock;
int use_lock;
struct run *freelist;
} kmem;
程式實作
freerange會透過對每一個page呼叫kfree將memory加入free-list中,而kalloc則會移除並return free-list第一個page。
//kalloc.c
void freerange(void *vstart, void *vend)
{
char *p;
p = (char*)PGROUNDUP((uint)vstart);
for(; p + PGSIZE <= (char*)vend; p += PGSIZE)
kfree(p);
}
// Free the page of physical memory pointed at by v
void kfree(char *v)
{
struct run *r;
//v是否對齊page size、超出最大memory等
if((uint)v % PGSIZE || v < end || V2P(v) >= PHYSTOP)
panic("kfree");
// Fill with junk to catch dangling refs.
memset(v, 1, PGSIZE);
if(kmem.use_lock)
acquire(&kmem.lock);
r = (struct run*)v;
r−>next = kmem.freelist; // 將v和當前空閒free-list連接起來
kmem.freelist = r; // 更新free-list
if(kmem.use_lock)
release(&kmem.lock);
}
// Allocate one 4096−byte page of physical memory.
// Returns a pointer that the kernel can use.
// Returns 0 if the memory cannot be allocated.
char* kalloc(void)
{
struct run *r;
if(kmem.use_lock)
acquire(&kmem.lock);
r = kmem.freelist;
if(r)
kmem.freelist = r−>next; //移除first element並更新free-list
if(kmem.use_lock)
release(&kmem.lock);
return (char*)r;
}
建立address space
kvmalloc會建立並切換至kernel所需的page table,呼叫setupkvm後,setupkvm會呼叫kalloc建立page directory,再透過mappages建立mapping。
資料結構
kmap將kernel的virtual address切成4塊,分別對應到physical address。
//vm.c
static struct kmap {
void *virt; // virtual memory start address
uint phys_start; //physical memory start address
uint phys_end; // physical memory end address
int perm; // 使用權限
} kmap[] = {
{ (void*)KERNBASE, 0, EXTMEM, PTE_W}, // I/O space
{ (void*)KERNLINK, V2P(KERNLINK), V2P(data), 0}, // kern text+rodata
{ (void*)data, V2P(data), PHYSTOP, PTE_W}, // kern data+memory
{ (void*)DEVSPACE, DEVSPACE, 0, PTE_W}, // more devices
};
vitual address的結構和轉換方法
//vm.c
// A virtual address ’la’ has a three−part structure as follows:
//
// +−−−−−−−−10−−−−−−+−−−−−−−10−−−−−−−+−−−−−−−−−12−−−−−−−−−−+
// | Page Directory | Page Table | Offset within Page |
// | Index | Index | |
// +−−−−−−−−−−−−−−−−+−−−−−−−−−−−−−−−−+−−−−−−−−−−−−−−−−−−−−−+
// \−−− PDX(va) −−/ \−−− PTX(va) −−/
// page directory index
#define PDX(va) (((uint)(va) >> PDXSHIFT) & 0x3FF)
// page table index
#define PTX(va) (((uint)(va) >> PTXSHIFT) & 0x3FF)
#define PGSHIFT 12 // log2(PGSIZE)
#define PTXSHIFT 12 // offset of PTX in a linear address
#define PDXSHIFT 22 // offset of PDX in a linear address
// Address in page table or page directory entry
#define PTE_ADDR(pte) ((uint)(pte) & ~0xFFF)
#define PGROUNDUP(sz) (((sz)+PGSIZE−1) & ~(PGSIZE−1))
#define PGROUNDDOWN(a) (((a)) & ~(PGSIZE−1))
程式實作
//vm.c
// create and switch to a page table with the mappings
// above KERNBASE required for the kernel to run
void kvmalloc(void)
{
kpgdir = setupkvm();
switchkvm();
}
// Set up kernel part of a page table.
pde_t* setupkvm(void)
{
pde_t *pgdir; // page directory entry
struct kmap *k;
// 分配空間給page directory,無法分配時return 0
if((pgdir = (pde_t*)kalloc()) == 0)
return 0;
memset(pgdir, 0, PGSIZE);
if (P2V(PHYSTOP) > (void*)DEVSPACE)
panic("PHYSTOP too high");
// 呼叫mapppages建立 virtual address到physical address的映射
for(k = kmap; k < &kmap[NELEM(kmap)]; k++)
if(mappages(pgdir, k−>virt, k−>phys_end − k−>phys_start,
(uint)k−>phys_start, k−>perm) < 0) {
freevm(pgdir);
return 0;
}
return pgdir;
}
// 建立從va(virtual addresses)映射到pa(physical addresses)的PTEs,
// va的大小不一定page-aligned
static int mappages(pde_t *pgdir, void *va, uint size, uint pa, int perm)
{
char *a, *last;
pte_t *pte;
a = (char*)PGROUNDDOWN((uint)va);
last = (char*)PGROUNDDOWN(((uint)va) + size − 1);
//將[a, last]切成許多page,分別映射到physical memory上
for(;;) {
if((pte = walkpgdir(pgdir, a, 1)) == 0)
return −1;
if(*pte & PTE_P)
panic("remap");
*pte = pa | perm | PTE_P; //初始化PTE標誌,保存指向的physical memory address
if(a == last)
break;
//mapping next page
a += PGSIZE;
pa += PGSIZE;
}
return 0;
}
static pte_t* walkpgdir(pde_t *pgdir, const void *va, int alloc)
{
pde_t *pde; // page directory entry
pte_t *pgtab; // page table
// 透過va前10bit找到page directory entry
pde = &pgdir[PDX(va)];
//page directory entry存在
if(*pde & PTE_P) {
pgtab = (pte_t*)P2V(PTE_ADDR(*pde));
} else {
if(!alloc || (pgtab = (pte_t*)kalloc()) == 0)
return 0;
// Make sure all those PTE_P bits are zero.
memset(pgtab, 0, PGSIZE);
//physical address放入pde中,並設定權限
*pde = V2P(pgtab) | PTE_P | PTE_W | PTE_U;
}
return &pgtab[PTX(va)];
}
上圖顯示了xv6中運行的user process結構,其中的stack是single page,存放著exec創建的初始數據,而最上方儲存command-line arguments,以及指向這些參數的pointer。為了保護stack不會超出stack page,xv6在stack下方設置了沒有映射的guard page,以確保stack超出範圍時,硬體會丟出exception。
heap可以透過system call sbrk來增加,sbrk是由growproc來實作。
//proc.c
// Grow current process’s memory by n bytes.
// Return 0 on success, −1 on failure.
int growproc(int n)
{
uint sz;
struct proc *curproc = myproc();
sz = curproc−>sz;
if(n > 0){
if((sz = allocuvm(curproc−>pgdir, sz, sz + n)) == 0)
return −1;
} else if(n < 0){
if((sz = deallocuvm(curproc−>pgdir, sz, sz + n)) == 0)
return −1;
}
curproc−>sz = sz;
switchuvm(curproc);
return 0;
}
system call exec會讀取一份文件來初始化user part of an address space,打開文件後會檢查是否有ELF binary,接著透過setupkvm建立page table,之後allocuvm替每個ELF segment分配memory,呼叫loaduvm載入到memory,最後設定stack。
資料結構
//elf.h
#define ELF_MAGIC 0x464C457FU // "\x7FELF" in little endian
// file header
struct elfhdr {
uint magic; // must equal ELF_MAGIC
uchar elf[12];
ushort type;
ushort machine;
uint version;
uint entry;
uint phoff;
uint shoff;
uint flags;
ushort ehsize;
ushort phentsize;
ushort phnum;
ushort shentsize;
ushort shnum;
ushort shstrndx;
};
// program section header
struct proghdr {
uint type;
uint off;
uint vaddr;
uint paddr;
uint filesz;
uint memsz;
uint flags;
uint align;
};
程式實作
int exec(char *path, char **argv)
{
char *s, *last;
int i, off;
uint argc, sz, sp, ustack[3+MAXARG+1];
struct elfhdr elf;
struct inode *ip;
struct proghdr ph;
pde_t *pgdir, *oldpgdir;
struct proc *curproc = myproc();
begin_op();
//打開對應路徑的文件
if((ip = namei(path)) == 0){
end_op();
cprintf("exec: fail\n");
return -1;
}
ilock(ip);
pgdir = 0;
// 檢查ELF header
if(readi(ip, (char*)&elf, 0, sizeof(elf)) != sizeof(elf))
goto bad;
if(elf.magic != ELF_MAGIC)
goto bad;
//分配新的page table
if((pgdir = setupkvm()) == 0)
goto bad;
// 加載程式到記憶體
sz = 0;
for(i = 0, off = elf.phoff; i < elf.phnum; i++, off += sizeof(ph)){
if(readi(ip, (char*)&ph, off, sizeof(ph)) != sizeof(ph))
goto bad;
if(ph.type != ELF_PROG_LOAD)
continue;
if(ph.memsz < ph.filesz)
goto bad;
if(ph.vaddr + ph.memsz < ph.vaddr)
goto bad;
if((sz = allocuvm(pgdir, sz, ph.vaddr + ph.memsz)) == 0)
goto bad;
if(ph.vaddr % PGSIZE != 0)
goto bad;
if(loaduvm(pgdir, (char*)ph.vaddr, ip, ph.off, ph.filesz) < 0)
goto bad;
}
iunlockput(ip);
end_op();
ip = 0;
// 在下一個page的分界線分配兩個page
sz = PGROUNDUP(sz);
if((sz = allocuvm(pgdir, sz, sz + 2*PGSIZE)) == 0)
goto bad;
clearpteu(pgdir, (char*)(sz - 2*PGSIZE));
sp = sz;
// 複製參數到stack中
for(argc = 0; argv[argc]; argc++) {
// 參數太多
if(argc >= MAXARG)
goto bad;
sp = (sp - (strlen(argv[argc]) + 1)) & ~3;
// copyout將參數複製到stack會檢查目標page是否可以存取,不行會return -1
if(copyout(pgdir, sp, argv[argc], strlen(argv[argc]) + 1) < 0)
goto bad;
ustack[3+argc] = sp;
}
ustack[3+argc] = 0;
ustack[0] = 0xffffffff; // fake return PC
ustack[1] = argc;
ustack[2] = sp - (argc + 1) * 4; // argv pointer
sp -= (3+argc+1) * 4;
if(copyout(pgdir, sp, ustack, (3+argc+1)*4) < 0)
goto bad;
//略...
// Commit to the user image
oldpgdir = curproc->pgdir;
curproc->pgdir = pgdir;
curproc->sz = sz;
curproc->tf->eip = elf.entry; // main
curproc->tf->esp = sp;
switchuvm(curproc);
freevm(oldpgdir);
return 0;
//detects an error, frees the new image, and returns –1
bad:
if(pgdir)
freevm(pgdir);
if(ip) {
iunlockput(ip);
end_op();
}
return -1;
}