Chapter 2 Page Tables

四資工三乙 B10415010 張竣東


Page Tables為一種透過作業系統控制記憶體的架構,讓眾多程序能在記憶體上執行,並保護程序不被另一個程序所干涉,除此之外也用於安排記憶體空間已供程序使用。

Paging hardware


X86的Page Table2的10次方Page Table Entry(以PTE做簡稱)所組成,而PTE的架構由20bitsPhysical Page Number(以PPN做簡稱)與一些Flag所構成(如下圖)。

以下介紹幾個重要的FLAG

  1. PTE_P 用來表示是否存在
  2. PTE_W 用來表示是否能寫入
  3. PTE_U 用來表示是否用戶端的程序能使用

在實體記憶體中Page Table有兩層的結構,第一層為Page Directory1024個PTE所構成(共4096Byte),第二層為Page Table同樣由1024個PTE所構成。

當讀取Virtual Address時,我們會將前10個bits用來選擇第一層的Page Directory並透過第一層的PTE找到對應的Page Table位置,並利用接續的10個bits找到第二層的PTE的PPN,把PPN加上剩餘的12個bits即為Virtual Address所對應的實體位置。

The Code of Page Table Structure

// A virtual address 'la' has a three-part structure as follows: // // +--------10------+-------10-------+---------12----------+ // | Page Directory | Page Table | Offset within Page | // | Index | Index | | // +----------------+----------------+---------------------+ // \--- PDX(va) --/ \--- PTX(va) --/ // page directory index #define PDX(va) (((uint)(va) >> PDXSHIFT) & 0x3FF) // page table index #define PTX(va) (((uint)(va) >> PTXSHIFT) & 0x3FF) // construct virtual address from indexes and offset #define PGADDR(d, t, o) ((uint)((d) << PDXSHIFT | (t) << PTXSHIFT | (o))) // Page directory and page table constants. #define NPDENTRIES 1024 // # directory entries per page directory #define NPTENTRIES 1024 // # PTEs per page table #define PGSIZE 4096 // bytes mapped by a page #define PGSHIFT 12 // log2(PGSIZE) #define PTXSHIFT 12 // offset of PTX in a linear address #define PDXSHIFT 22 // offset of PDX in a linear address #define PGROUNDUP(sz) (((sz)+PGSIZE-1) & ~(PGSIZE-1)) #define PGROUNDDOWN(a) (((a)) & ~(PGSIZE-1)) // Page table/directory entry flags. #define PTE_P 0x001 // Present #define PTE_W 0x002 // Writeable #define PTE_U 0x004 // User #define PTE_PWT 0x008 // Write-Through #define PTE_PCD 0x010 // Cache-Disable #define PTE_A 0x020 // Accessed #define PTE_D 0x040 // Dirty #define PTE_PS 0x080 // Page Size #define PTE_MBZ 0x180 // Bits must be zero // Address in page table or page directory entry #define PTE_ADDR(pte) ((uint)(pte) & ~0xFFF) #define PTE_FLAGS(pte) ((uint)(pte) & 0xFFF)

Process Address Space


每個Process都擁有各自的Page Table,並使Vitrual Address 0作為程式起始位置,並最高可定址至KERNBASE(最大為2GB)。

虛擬位置的KENRBASE到KENRBASE+PHYSTOP所對應的實體位置為0到PHYSTOP,以便Kernal方便的處理實體位置的內容。

每項程序都能擁有連續的虛擬記憶體,但在實體位置上能以不連續的記憶體空間呈現。

The Code of Memory Layout Define

// memlayout.h #define EXTMEM 0x100000 // Start of extended memory #define PHYSTOP 0xE000000 // Top physical memory #define DEVSPACE 0xFE000000 // Other devices are at high addresses // Key addresses for address space layout (see kmap in vm.c for layout) #define KERNBASE 0x80000000 // First kernel virtual address #define KERNLINK (KERNBASE+EXTMEM) // Address where kernel is linked #define V2P(a) (((uint) (a)) ? KERNBASE) #define P2V(a) (((void *) (a)) + KERNBASE) #define V2P_WO(x) ((x) ? KERNBASE) // same as V2P, but without casts #define P2V_WO(x) ((x) + KERNBASE) // same as P2V, but without casts

The Code For Creating An Address Space

首先main呼叫kvmalloc(),而在該函式內先呼叫setupkvm()並設置道kpgdir以表示目前的Page Table,再透過switchkvm()改變%cr3暫存器。

接下來,mappages()會將Page Table的虛擬位址賦予相對應的實體位址,之後呼叫walkpgdir()來找虛擬位址的PTE並設置合適的FLAGS。

// vm.c // Set up kernel part of a page table. pde_t* setupkvm(void) { pde_t *pgdir; struct kmap *k; if((pgdir = (pde_t*)kalloc()) == 0) return 0; memset(pgdir, 0, PGSIZE); if (P2V(PHYSTOP) > (void*)DEVSPACE) panic("PHYSTOP too high"); for(k = kmap; k < &kmap[NELEM(kmap)]; k++) if(mappages(pgdir, k->virt, k->phys_end - k->phys_start, (uint)k->phys_start, k->perm) < 0) { freevm(pgdir); return 0; } return pgdir; } // Allocate one page table for the machine for the kernel address // space for scheduler processes. void kvmalloc(void) { kpgdir = setupkvm(); switchkvm(); } // Switch h/w page table register to the kernel-only page table, // for when no process is running. void switchkvm(void) { lcr3(V2P(kpgdir)); // switch to the kernel page table } // x86.h lcr3(uint val) { asm volatile("movl %0,%%cr3" : : "r" (val)); } // vm.c // Return the address of the PTE in page table pgdir // that corresponds to virtual address va. If alloc!=0, // create any required page table pages. static pte_t * walkpgdir(pde_t *pgdir, const void *va, int alloc) { pde_t *pde; pte_t *pgtab; pde = &pgdir[PDX(va)]; if(*pde & PTE_P){ pgtab = (pte_t*)P2V(PTE_ADDR(*pde)); } else { if(!alloc || (pgtab = (pte_t*)kalloc()) == 0) return 0; // Make sure all those PTE_P bits are zero. memset(pgtab, 0, PGSIZE); // The permissions here are overly generous, but they can // be further restricted by the permissions in the page table // entries, if necessary. *pde = V2P(pgtab) | PTE_P | PTE_W | PTE_U; } return &pgtab[PTX(va)]; } // Create PTEs for virtual addresses starting at va that refer to // physical addresses starting at pa. va and size might not // be page-aligned. static int mappages(pde_t *pgdir, void *va, uint size, uint pa, int perm) { char *a, *last; pte_t *pte; a = (char*)PGROUNDDOWN((uint)va); last = (char*)PGROUNDDOWN(((uint)va) + size - 1); for(;;){ if((pte = walkpgdir(pgdir, a, 1)) == 0) return -1; if(*pte & PTE_P) panic("remap"); *pte = pa | perm | PTE_P; if(a == last) break; a += PGSIZE; pa += PGSIZE; } return 0; }

Physical Memory Allocation


xv6對於記憶體的設置與釋放皆為一次4096Byte為一個單位(即為一個page),透過Linking List連結未使用的page,設置page時,將該page移出連結,反之移入。

The Code For Physical Memory Allocator

main呼叫kinit1()與kinit2()來初始化allocator,兩者皆透過freerange()來設置記憶體資源,freerang()再呼叫kfree()將記憶體放入Linking List中。

// kalloc.c struct run { struct run *next; }; // Initialization happens in two phases. // 1. main() calls kinit1() while still using entrypgdir to place just // the pages mapped by entrypgdir on free list. // 2. main() calls kinit2() with the rest of the physical pages // after installing a full page table that maps them on all cores. void kinit1(void *vstart, void *vend) { initlock(&kmem.lock, "kmem"); kmem.use_lock = 0; freerange(vstart, vend); } void kinit2(void *vstart, void *vend) { freerange(vstart, vend); kmem.use_lock = 1; } void freerange(void *vstart, void *vend) { char *p; p = (char*)PGROUNDUP((uint)vstart); for(; p + PGSIZE <= (char*)vend; p += PGSIZE) kfree(p); } //PAGEBREAK: 21 // Free the page of physical memory pointed at by v, // which normally should have been returned by a // call to kalloc(). (The exception is when // initializing the allocator; see kinit above.) void kfree(char *v) { struct run *r; if((uint)v % PGSIZE || v < end || V2P(v) >= PHYSTOP) panic("kfree"); // Fill with junk to catch dangling refs. memset(v, 1, PGSIZE); if(kmem.use_lock) acquire(&kmem.lock); r = (struct run*)v; r->next = kmem.freelist; kmem.freelist = r; if(kmem.use_lock) release(&kmem.lock); } // Allocate one 4096-byte page of physical memory. // Returns a pointer that the kernel can use. // Returns 0 if the memory cannot be allocated. char* kalloc(void) { struct run *r; if(kmem.use_lock) acquire(&kmem.lock); r = kmem.freelist; if(r) kmem.freelist = r->next; if(kmem.use_lock) release(&kmem.lock); return (char*)r; }

User part of an Address space


每個程序的Virtual Address起始位置皆從0開始,並放入該程序的程式碼,並利用StackHeap來增加使用的空間。

Gurad Page本身並沒有相對應的位置,若不小心參考到該位址時則會回傳錯誤訊息。

The Code For Heap Expend : sbrk


透過sys_sbrk()呼叫growproc()來控制Heap所擁有的空間,當要增加空間時,growproc()呼叫allocuvm(),減少則呼叫deallocuvm(),最後再呼叫switchuvm()更新程序的狀態。

// syscall.c // Fetch the nth 32-bit system call argument. int argint(int n, int *ip) { return fetchint((myproc()->tf->esp) + 4 + 4*n, ip); } // sysproc.c int sys_sbrk(void) { int addr; int n; if(argint(0, &n) < 0) return -1; addr = myproc()->sz; if(growproc(n) < 0) return -1; return addr; } // proc.h // Per-process state struct proc { uint sz; // Size of process memory (bytes) pde_t* pgdir; // Page table char *kstack; // Bottom of kernel stack for this process enum procstate state; // Process state int pid; // Process ID struct proc *parent; // Parent process struct trapframe *tf; // Trap frame for current syscall struct context *context; // swtch() here to run process void *chan; // If non-zero, sleeping on chan int killed; // If non-zero, have been killed struct file *ofile[NOFILE]; // Open files struct inode *cwd; // Current directory char name[16]; // Process name (debugging) }; // proc.c // Disable interrupts so that we are not rescheduled // while reading proc from the cpu structure struct proc* myproc(void) { struct cpu *c; struct proc *p; pushcli(); c = mycpu(); p = c->proc; popcli(); return p; } // Grow current process's memory by n bytes. // Return 0 on success, -1 on failure. int growproc(int n) { uint sz; struct proc *curproc = myproc(); sz = curproc->sz; if(n > 0){ if((sz = allocuvm(curproc->pgdir, sz, sz + n)) == 0) return -1; } else if(n < 0){ if((sz = deallocuvm(curproc->pgdir, sz, sz + n)) == 0) return -1; } curproc->sz = sz; switchuvm(curproc); return 0; } // vm.c // Allocate page tables and physical memory to grow process from oldsz to // newsz, which need not be page aligned. Returns new size or 0 on error. int allocuvm(pde_t *pgdir, uint oldsz, uint newsz) { char *mem; uint a; if(newsz >= KERNBASE) return 0; if(newsz < oldsz) return oldsz; a = PGROUNDUP(oldsz); for(; a < newsz; a += PGSIZE){ mem = kalloc(); if(mem == 0){ cprintf("allocuvm out of memory\n"); deallocuvm(pgdir, newsz, oldsz); return 0; } memset(mem, 0, PGSIZE); if(mappages(pgdir, (char*)a, PGSIZE, V2P(mem), PTE_W|PTE_U) < 0){ cprintf("allocuvm out of memory (2)\n"); deallocuvm(pgdir, newsz, oldsz); kfree(mem); return 0; } } return newsz; } // Deallocate user pages to bring the process size from oldsz to // newsz. oldsz and newsz need not be page-aligned, nor does newsz // need to be less than oldsz. oldsz can be larger than the actual // process size. Returns the new process size. int deallocuvm(pde_t *pgdir, uint oldsz, uint newsz) { pte_t *pte; uint a, pa; if(newsz >= oldsz) return oldsz; a = PGROUNDUP(newsz); for(; a < oldsz; a += PGSIZE){ pte = walkpgdir(pgdir, (char*)a, 0); if(!pte) a = PGADDR(PDX(a) + 1, 0, 0) - PGSIZE; else if((*pte & PTE_P) != 0){ pa = PTE_ADDR(*pte); if(pa == 0) panic("kfree"); char *v = P2V(pa); kfree(v); *pte = 0; } } return newsz; } // Switch TSS and h/w page table to correspond to process p. void switchuvm(struct proc *p) { if(p == 0) panic("switchuvm: no process"); if(p->kstack == 0) panic("switchuvm: no kstack"); if(p->pgdir == 0) panic("switchuvm: no pgdir"); pushcli(); mycpu()->gdt[SEG_TSS] = SEG16(STS_T32A, &mycpu()->ts, sizeof(mycpu()->ts)-1, 0); mycpu()->gdt[SEG_TSS].s = 0; mycpu()->ts.ss0 = SEG_KDATA << 3; mycpu()->ts.esp0 = (uint)p->kstack + KSTACKSIZE; // setting IOPL=0 in eflags *and* iomb beyond the tss segment limit // forbids I/O instructions (e.g., inb and outb) from user space mycpu()->ts.iomb = (ushort) 0xFFFF; ltr(SEG_TSS << 3); lcr3(V2P(p->pgdir)); // switch to process's address space popcli(); }

The Code For Process Executing : exec


透過sys_exec()得知的參數提供給exec(),exec()再利用readi()和elf.magic檢查是否可以讀取與執行,接著呼叫setupkvm()建立page,透過loaduvm()將程序寫入Page Table,再按照格式把argv等相關資料填入Stack中,最後exec()將程序名稱以safestrcpy()進行儲存,以供偵錯使用。

// sysfile.c int sys_exec(void) { char *path, *argv[MAXARG]; int i; uint uargv, uarg; if(argstr(0, &path) < 0 || argint(1, (int*)&uargv) < 0){ return -1; } memset(argv, 0, sizeof(argv)); for(i=0;; i++){ if(i >= NELEM(argv)) return -1; if(fetchint(uargv+4*i, (int*)&uarg) < 0) return -1; if(uarg == 0){ argv[i] = 0; break; } if(fetchstr(uarg, &argv[i]) < 0) return -1; } return exec(path, argv); } // fs.c struct inode* namei(char *path) { char name[DIRSIZ]; return namex(path, 0, name); } //PAGEBREAK! // Read data from inode. // Caller must hold ip->lock. int readi(struct inode *ip, char *dst, uint off, uint n) { uint tot, m; struct buf *bp; if(ip->type == T_DEV){ if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read) return -1; return devsw[ip->major].read(ip, dst, n); } if(off > ip->size || off + n < off) return -1; if(off + n > ip->size) n = ip->size - off; for(tot=0; tot<n; tot+=m, off+=m, dst+=m){ bp = bread(ip->dev, bmap(ip, off/BSIZE)); m = min(n - tot, BSIZE - off%BSIZE); memmove(dst, bp->data + off%BSIZE, m); brelse(bp); } return n; } // elf.h // Format of an ELF executable file #define ELF_MAGIC 0x464C457FU // "\x7FELF" in little endian // File header struct elfhdr { uint magic; // must equal ELF_MAGIC uchar elf[12]; ushort type; ushort machine; uint version; uint entry; uint phoff; uint shoff; uint flags; ushort ehsize; ushort phentsize; ushort phnum; ushort shentsize; ushort shnum; ushort shstrndx; }; // Program section header struct proghdr { uint type; uint off; uint vaddr; uint paddr; uint filesz; uint memsz; uint flags; uint align; }; // Values for Proghdr type #define ELF_PROG_LOAD 1 // Flag bits for Proghdr flags #define ELF_PROG_FLAG_EXEC 1 #define ELF_PROG_FLAG_WRITE 2 #define ELF_PROG_FLAG_READ 4 // exec.c int exec(char *path, char **argv) { char *s, *last; int i, off; uint argc, sz, sp, ustack[3+MAXARG+1]; struct elfhdr elf; struct inode *ip; struct proghdr ph; pde_t *pgdir, *oldpgdir; struct proc *curproc = myproc(); begin_op(); if((ip = namei(path)) == 0){ end_op(); cprintf("exec: fail\n"); return -1; } ilock(ip); pgdir = 0; // Check ELF header if(readi(ip, (char*)&elf, 0, sizeof(elf)) != sizeof(elf)) goto bad; if(elf.magic != ELF_MAGIC) goto bad; if((pgdir = setupkvm()) == 0) goto bad; // Load program into memory. sz = 0; for(i=0, off=elf.phoff; i<elf.phnum; i++, off+=sizeof(ph)){ if(readi(ip, (char*)&ph, off, sizeof(ph)) != sizeof(ph)) goto bad; if(ph.type != ELF_PROG_LOAD) continue; if(ph.memsz < ph.filesz) goto bad; if(ph.vaddr + ph.memsz < ph.vaddr) goto bad; if((sz = allocuvm(pgdir, sz, ph.vaddr + ph.memsz)) == 0) goto bad; if(ph.vaddr % PGSIZE != 0) goto bad; if(loaduvm(pgdir, (char*)ph.vaddr, ip, ph.off, ph.filesz) < 0) goto bad; } iunlockput(ip); end_op(); ip = 0; // Allocate two pages at the next page boundary. // Make the first inaccessible. Use the second as the user stack. sz = PGROUNDUP(sz); if((sz = allocuvm(pgdir, sz, sz + 2*PGSIZE)) == 0) goto bad; clearpteu(pgdir, (char*)(sz - 2*PGSIZE)); sp = sz; // Push argument strings, prepare rest of stack in ustack. for(argc = 0; argv[argc]; argc++) { if(argc >= MAXARG) goto bad; sp = (sp - (strlen(argv[argc]) + 1)) & ~3; if(copyout(pgdir, sp, argv[argc], strlen(argv[argc]) + 1) < 0) goto bad; ustack[3+argc] = sp; } ustack[3+argc] = 0; ustack[0] = 0xffffffff; // fake return PC ustack[1] = argc; ustack[2] = sp - (argc+1)*4; // argv pointer sp -= (3+argc+1) * 4; if(copyout(pgdir, sp, ustack, (3+argc+1)*4) < 0) goto bad; // Save program name for debugging. for(last=s=path; *s; s++) if(*s == '/') last = s+1; safestrcpy(curproc->name, last, sizeof(curproc->name)); // Commit to the user image. oldpgdir = curproc->pgdir; curproc->pgdir = pgdir; curproc->sz = sz; curproc->tf->eip = elf.entry; // main curproc->tf->esp = sp; switchuvm(curproc); freevm(oldpgdir); return 0; bad: if(pgdir) freevm(pgdir); if(ip){ iunlockput(ip); end_op(); } return -1; } // vm.c // Load a program segment into pgdir. addr must be page-aligned // and the pages from addr to addr+sz must already be mapped. int loaduvm(pde_t *pgdir, char *addr, struct inode *ip, uint offset, uint sz) { uint i, pa, n; pte_t *pte; if((uint) addr % PGSIZE != 0) panic("loaduvm: addr must be page aligned"); for(i = 0; i < sz; i += PGSIZE){ if((pte = walkpgdir(pgdir, addr+i, 0)) == 0) panic("loaduvm: address should exist"); pa = PTE_ADDR(*pte); if(sz - i < PGSIZE) n = sz - i; else n = PGSIZE; if(readi(ip, P2V(pa), offset+i, n) != n) return -1; } return 0; } // string.c // Like strncpy but guaranteed to NUL-terminate. char* safestrcpy(char *s, const char *t, int n) { char *os; os = s; if(n <= 0) return os; while(--n > 0 && (*s++ = *t++) != 0) ; *s = 0; return os; }