B10515020
四資工三
葉敏宣
作業系統的工作有下列 4 項:提供比單獨由硬體所提供更有用的服務、管理底層的硬體、使多個程式在電腦上同時運行、共享資源、共同工作。
作業系統透過 interface,讓程式可以使用這些服務,而所謂的 interface 就是作業系統提供的一系列 system call。
Figure 0-1. A kernel and two user processes.
在xv6中,kernel 是一個負責提供上述服務的程式。process 需要透過 system call 來使用這些 kernel 中的服務。這樣的機制讓每一個 process 只能存取自己 memory、data。
一個作業系統必須要達到下列的需求:多工、隔離、互動。
在這一章中,透過 xv6 中第一個運行的 process 來說明 xv6 是如何運行、如何達到上述所說的需求。
process 讓程式可以假設自己獨佔一台電腦,擁有自己的 memory、CPU,藉此達成前段所說"隔離"的需求。
xv6 使用 page tables 讓每個 process 擁有自己的 address space,透過每個 process 的 page table 將 virtual address mapping 到 physical address。
page table 的功用:
// Page table/directory entry flags.
#define PTE_P 0x001 // Present
#define PTE_W 0x002 // Writeable
#define PTE_U 0x004 // User
#define PTE_PWT 0x008 // Write−Through
#define PTE_PCD 0x010 // Cache−Disable
#define PTE_A 0x020 // Accessed
#define PTE_D 0x040 // Dirty
#define PTE_PS 0x080 // Page Size
#define PTE_MBZ 0x180 // Bits must be zero
// Memory layout
#define EXTMEM 0x100000 // Start of extended memory
#define PHYSTOP 0xE000000 // Top physical memory
#define DEVSPACE 0xFE000000 // Other devices are at high addresses
// Key addresses for address space layout (see kmap in vm.c for layout)
#define KERNBASE 0x80000000 // First kernel virtual address
#define KERNLINK (KERNBASE+EXTMEM) // Address where kernel is linked
#ifndef __ASSEMBLER__
static inline uint v2p(void *a) { return (uint) a − KERNBASE; }
static inline void *p2v(uint a) { return (void *) a + KERNBASE; }
#endif
#define V2P(a) ((uint) a − KERNBASE)
#define P2V(a) ((void *) a + KERNBASE)
#define V2P_WO(x) ((x) − KERNBASE) // same as V2P, but without casts
#define P2V_WO(x) ((x) + KERNBASE) // same as V2P, but without casts
// Set up kernel part of a page table.
pde_t* setupkvm(char* (*alloc)(void)) {
pde_t *pgdir;
struct kmap *k;
if ((pgdir = (pde_t*)alloc()) == 0)
return 0;
memset(pgdir, 0, PGSIZE);
if (p2v(PHYSTOP) > (void*)DEVSPACE)
panic("PHYSTOP too high");
for (k = kmap; k < &kmap[NELEM(kmap)]; k++)
if (mappages(pgdir, k−>virt, k−>phys_end − k−>phys_start,
(uint)k−>phys_start, k−>perm, alloc) < 0)
return 0;
return pgdir;
}
// Allocate one page table for the machine for the kernel address
// space for scheduler processes.
void kvmalloc(void) {
kpgdir = setupkvm(enter_alloc);
switchkvm();
}
// Create PTEs for virtual addresses starting at va that refer to
// physical addresses starting at pa. va and size might not
// be page−aligned.
static int mappages(pde_t *pgdir, void *va, uint size, uint pa,
int perm, char* (*alloc)(void)) {
char *a, *last;
pte_t *pte;
a = (char*)PGROUNDDOWN((uint)va);
last = (char*)PGROUNDDOWN(((uint)va) + size − 1);
for (;;) {
if ((pte = walkpgdir(pgdir, a, alloc)) == 0)
return −1;
if (*pte & PTE_P)
panic("remap");
*pte = pa | perm | PTE_P;
if (a == last)
break;
a += PGSIZE;
pa += PGSIZE;
}
return 0;
}
// Return the address of the PTE in page table pgdir
// that corresponds to virtual address va. If alloc!=0,
// create any required page table pages.
static pte_t* walkpgdir(pde_t *pgdir, const void *va, char* (*alloc)(void)) {
pde_t *pde;
pte_t *pgtab;
pde = &pgdir[PDX(va)];
if (*pde & PTE_P) {
pgtab = (pte_t*)p2v(PTE_ADDR(*pde));
} else {
if (!alloc || (pgtab = (pte_t*)alloc()) == 0)
return 0;
// Make sure all those PTE_P bits are zero.
memset(pgtab, 0, PGSIZE);
// The permissions here are overly generous, but they can
// be further restricted by the permissions in the page table
// entries, if necessary.
*pde = v2p(pgtab) | PTE_P | PTE_W | PTE_U;
}
return &pgtab[PTX(va)];
}
struct run {
struct run *next;
};
struct {
struct spinlock lock;
int use_lock;
struct run *freelist;
} kmem;
// Bootstrap processor starts running C code here.
// Allocate a real stack and switch to it, first
// doing some setup required for memory allocator to work.
int main(void) {
kinit1(end, P2V(4*1024*1024)); // phys page allocator
kvmalloc(); // kernel page table
mpinit(); // detect other processors
lapicinit(); // interrupt controller
seginit(); // segment descriptors
picinit(); // disable pic
ioapicinit(); // another interrupt controller
consoleinit(); // console hardware
uartinit(); // serial port
pinit(); // process table
tvinit(); // trap vectors
binit(); // buffer cache
fileinit(); // file table
ideinit(); // disk
startothers(); // start other processors
kinit2(P2V(4*1024*1024), P2V(PHYSTOP));
// must come after startothers()
userinit(); // first user process
mpmain(); // finish this processor’s setup
}
// Initialization happens in two phases.
// 1. main() calls kinit1() while still using
// entrypgdir to place just
// the pages mapped by entrypgdir on free list.
// 2. main() calls kinit2() with the
// rest of the physical pages
// after installing a full page table that
// maps them on all cores.
void kinit1(void *vstart, void *vend) {
initlock(&kmem.lock, "kmem");
kmem.use_lock = 0;
freerange(vstart, vend);
}
void kinit2(void *vstart, void *vend) {
freerange(vstart, vend);
kmem.use_lock = 1;
}
// Free the page of physical memory pointed at by v,
// which normally should have been returned by a
// call to kalloc(). (The exception is when
// initializing the allocator; see kinit above.)
void kfree(char *v) {
struct run *r;
if ((uint)v % PGSIZE || v < end || V2P(v) >= PHYSTOP)
panic("kfree");
// Fill with junk to catch dangling refs.
memset(v, 1, PGSIZE);
if (kmem.use_lock)
acquire(&kmem.lock);
r = (struct run*)v;
r−>next = kmem.freelist;
kmem.freelist = r;
if (kmem.use_lock)
release(&kmem.lock);
}
int sys_sbrk(void) {
int addr;
int n;
if (argint(0, &n) < 0)
return −1;
addr = myproc()−>sz;
if (growproc(n) < 0)
return −1;
return addr;
}
// Grow current process’s memory by n bytes.
// Return 0 on success, −1 on failure
int growproc(int n) {
uint sz;
struct proc *curproc = myproc();
sz = curproc−>sz;
if (n > 0) {
if ((sz = allocuvm(curproc−>pgdir, sz, sz + n)) == 0)
return −1;
} else if(n < 0) {
if ((sz = deallocuvm(curproc−>pgdir, sz, sz + n)) == 0)
return −1;
}
curproc−>sz = sz;
switchuvm(curproc);
return 0;
}
// Allocate page tables and physical memory to
// grow process from oldsz to
// newsz, which need not be page aligned.
// Returns new size or 0 on error
int allocuvm(pde_t *pgdir, uint oldsz, uint newsz) {
char *mem;
uint a;
if (newsz >= KERNBASE)
return 0;
if (newsz < oldsz)
return oldsz;
a = PGROUNDUP(oldsz);
for (; a < newsz; a += PGSIZE) {
mem = kalloc();
if (mem == 0) {
cprintf("allocuvm out of memory\n");
deallocuvm(pgdir, newsz, oldsz);
return 0;
}
memset(mem, 0, PGSIZE);
if (mappages(pgdir, (char*)a, PGSIZE,
V2P(mem), PTE_W|PTE_U) < 0) {
cprintf("allocuvm out of memory (2)\n");
deallocuvm(pgdir, newsz, oldsz);
kfree(mem);
return 0;
}
}
return newsz;
}
// Deallocate user pages to bring the process size from oldsz to
// newsz. oldsz and newsz need not be page−aligned, nor does newsz
// need to be less than oldsz. oldsz can be larger than the actual
// process size. Returns the new process size.
int deallocuvm(pde_t *pgdir, uint oldsz, uint newsz) {
pte_t *pte;
uint a, pa;
if(newsz >= oldsz)
return oldsz;
a = PGROUNDUP(newsz);
for (; a < oldsz; a += PGSIZE) {
pte = walkpgdir(pgdir, (char*)a, 0);
if (!pte)
a = PGADDR(PDX(a) + 1, 0, 0) − PGSIZE;
else if ((*pte & PTE_P) != 0) {
pa = PTE_ADDR(*pte);
if (pa == 0)
panic("kfree");
char *v = P2V(pa);
kfree(v);
*pte = 0;
}
}
return newsz;
}
int sys_exec(void) {
char *path, *argv[MAXARG];
int i;
uint uargv, uarg;
if (argstr(0, &path) < 0 || argint(1, (int*)&uargv) < 0) {
return -1;
}
memset(argv, 0, sizeof(argv));
for (i=0;; i++) {
if (i >= NELEM(argv))
return −1;
if (fetchint(uargv+4*i, (int*)&uarg) < 0)
return −1;
if (uarg == 0) {
argv[i] = 0;
break;
}
if (fetchstr(uarg, &argv[i]) < 0)
return −1;
}
return exec(path, argv);
}
int exec(char *path, char **argv) {
char *s, *last;
int i, off;
uint argc, sz, sp, ustack[3+MAXARG+1];
struct elfhdr elf;
struct inode *ip;
struct proghdr ph;
pde_t *pgdir, *oldpgdir;
struct proc *curproc = myproc();
begin_op();
if ((ip = namei(path)) == 0){
end_op();
cprintf("exec: fail\n");
return −1;
}
ilock(ip);
pgdir = 0;
// Check ELF header
if (readi(ip, (char*)&elf, 0, sizeof(elf)) != sizeof(elf))
goto bad;
if (elf.magic != ELF_MAGIC)
goto bad;
if ((pgdir = setupkvm()) == 0)
goto bad;
// Load program into memory.
sz = 0;
for (i=0, off=elf.phoff; i<elf.phnum; i++, off+=sizeof(ph)) {
if (readi(ip, (char*)&ph, off, sizeof(ph)) != sizeof(ph))
goto bad;
if (ph.type != ELF_PROG_LOAD)
continue;
if (ph.memsz < ph.filesz)
goto bad;
if (ph.vaddr + ph.memsz < ph.vaddr)
goto bad;
if ((sz = allocuvm(pgdir, sz, ph.vaddr + ph.memsz)) == 0)
goto bad;
if (ph.vaddr % PGSIZE != 0)
goto bad;
if (loaduvm(pgdir, (char*)ph.vaddr,
ip, ph.off, ph.filesz) < 0)
goto bad;
}
iunlockput(ip);
end_op();
ip = 0;
// Allocate two pages at the next page boundary.
// Make the first inaccessible.
// Use the second as the user stack. sz = PGROUNDUP(sz);
if ((sz = allocuvm(pgdir, sz, sz + 2*PGSIZE)) == 0)
goto bad;
clearpteu(pgdir, (char*)(sz − 2*PGSIZE));
sp = sz;
// Push argument strings, prepare rest of stack in ustack.
for (argc = 0; argv[argc]; argc++) {
if (argc >= MAXARG)
goto bad;
sp = (sp − (strlen(argv[argc]) + 1)) & ~3;
if (copyout(pgdir, sp, argv[argc],
strlen(argv[argc]) + 1) < 0)
goto bad;
ustack[3+argc] = sp;
}
ustack[3+argc] = 0;
ustack[0] = 0xffffffff; // fake return PC
ustack[1] = argc;
ustack[2] = sp − (argc+1)*4; // argv pointer
sp −= (3+argc+1) * 4;
if (copyout(pgdir, sp, ustack, (3+argc+1)*4) < 0)
goto bad;
// Save program name for debugging.
for (last=s=path; *s; s++)
if(*s == ’/’)
last = s+1;
safestrcpy(curproc−>name, last, sizeof(curproc−>name));
// Commit to the user image.
oldpgdir = curproc−>pgdir;
curproc−>pgdir = pgdir;
curproc−>sz = sz;
curproc−>tf−>eip = elf.entry; // main
curproc−>tf−>esp = sp;
switchuvm(curproc);
freevm(oldpgdir);
return 0;
bad:
if (pgdir)
freevm(pgdir);
if (ip) {
iunlockput(ip);
end_op();
}
return −1;
}
struct inode* namei(char *path) {
char name[DIRSIZ];
return namex(path, 0, name);
}
struct elfhdr {
uint magic; // must equal ELF_MAGIC
uchar elf[12];
ushort type;
ushort machine;
uint version;
uint entry;
uint phoff;
uint shoff;
uint flags;
ushort ehsize;
ushort phentsize;
ushort phnum;
ushort shentsize;
ushort shnum;
ushort shstrndx;
};
// Program section header
struct proghdr {
uint type;
uint off;
uint vaddr;
uint paddr;
uint filesz;
uint memsz;
uint flags;
uint align;
};