四資工三乙 B10415010 張竣東
Page Tables為一種透過作業系統控制記憶體的架構,讓眾多程序能在記憶體上執行,並保護程序不被另一個程序所干涉,除此之外也用於安排記憶體空間已供程序使用。
X86的Page Table由2的10次方個Page Table Entry(以PTE做簡稱)所組成,而PTE的架構由20bits的Physical Page Number(以PPN做簡稱)與一些Flag所構成(如下圖)。
以下介紹幾個重要的FLAG
在實體記憶體中Page Table有兩層的結構,第一層為Page Directory由1024個PTE所構成(共4096Byte),第二層為Page Table同樣由1024個PTE所構成。
當讀取Virtual Address時,我們會將前10個bits用來選擇第一層的Page Directory並透過第一層的PTE找到對應的Page Table位置,並利用接續的10個bits找到第二層的PTE的PPN,把PPN加上剩餘的12個bits即為Virtual Address所對應的實體位置。
// A virtual address 'la' has a three-part structure as follows:
//
// +--------10------+-------10-------+---------12----------+
// | Page Directory | Page Table | Offset within Page |
// | Index | Index | |
// +----------------+----------------+---------------------+
// \--- PDX(va) --/ \--- PTX(va) --/
// page directory index
#define PDX(va) (((uint)(va) >> PDXSHIFT) & 0x3FF)
// page table index
#define PTX(va) (((uint)(va) >> PTXSHIFT) & 0x3FF)
// construct virtual address from indexes and offset
#define PGADDR(d, t, o) ((uint)((d) << PDXSHIFT | (t) << PTXSHIFT | (o)))
// Page directory and page table constants.
#define NPDENTRIES 1024 // # directory entries per page directory
#define NPTENTRIES 1024 // # PTEs per page table
#define PGSIZE 4096 // bytes mapped by a page
#define PGSHIFT 12 // log2(PGSIZE)
#define PTXSHIFT 12 // offset of PTX in a linear address
#define PDXSHIFT 22 // offset of PDX in a linear address
#define PGROUNDUP(sz) (((sz)+PGSIZE-1) & ~(PGSIZE-1))
#define PGROUNDDOWN(a) (((a)) & ~(PGSIZE-1))
// Page table/directory entry flags.
#define PTE_P 0x001 // Present
#define PTE_W 0x002 // Writeable
#define PTE_U 0x004 // User
#define PTE_PWT 0x008 // Write-Through
#define PTE_PCD 0x010 // Cache-Disable
#define PTE_A 0x020 // Accessed
#define PTE_D 0x040 // Dirty
#define PTE_PS 0x080 // Page Size
#define PTE_MBZ 0x180 // Bits must be zero
// Address in page table or page directory entry
#define PTE_ADDR(pte) ((uint)(pte) & ~0xFFF)
#define PTE_FLAGS(pte) ((uint)(pte) & 0xFFF)
每個Process都擁有各自的Page Table,並使Vitrual Address 0作為程式起始位置,並最高可定址至KERNBASE(最大為2GB)。
虛擬位置的KENRBASE到KENRBASE+PHYSTOP所對應的實體位置為0到PHYSTOP,以便Kernal方便的處理實體位置的內容。
每項程序都能擁有連續的虛擬記憶體,但在實體位置上能以不連續的記憶體空間呈現。
// memlayout.h
#define EXTMEM 0x100000 // Start of extended memory
#define PHYSTOP 0xE000000 // Top physical memory
#define DEVSPACE 0xFE000000 // Other devices are at high addresses
// Key addresses for address space layout (see kmap in vm.c for layout)
#define KERNBASE 0x80000000 // First kernel virtual address
#define KERNLINK (KERNBASE+EXTMEM) // Address where kernel is linked
#define V2P(a) (((uint) (a)) ? KERNBASE)
#define P2V(a) (((void *) (a)) + KERNBASE)
#define V2P_WO(x) ((x) ? KERNBASE) // same as V2P, but without casts
#define P2V_WO(x) ((x) + KERNBASE) // same as P2V, but without casts
首先main呼叫kvmalloc(),而在該函式內先呼叫setupkvm()並設置道kpgdir以表示目前的Page Table,再透過switchkvm()改變%cr3暫存器。
接下來,mappages()會將Page Table的虛擬位址賦予相對應的實體位址,之後呼叫walkpgdir()來找虛擬位址的PTE並設置合適的FLAGS。
// vm.c
// Set up kernel part of a page table.
pde_t*
setupkvm(void)
{
pde_t *pgdir;
struct kmap *k;
if((pgdir = (pde_t*)kalloc()) == 0)
return 0;
memset(pgdir, 0, PGSIZE);
if (P2V(PHYSTOP) > (void*)DEVSPACE)
panic("PHYSTOP too high");
for(k = kmap; k < &kmap[NELEM(kmap)]; k++)
if(mappages(pgdir, k->virt, k->phys_end - k->phys_start,
(uint)k->phys_start, k->perm) < 0) {
freevm(pgdir);
return 0;
}
return pgdir;
}
// Allocate one page table for the machine for the kernel address
// space for scheduler processes.
void
kvmalloc(void)
{
kpgdir = setupkvm();
switchkvm();
}
// Switch h/w page table register to the kernel-only page table,
// for when no process is running.
void
switchkvm(void)
{
lcr3(V2P(kpgdir)); // switch to the kernel page table
}
// x86.h
lcr3(uint val)
{
asm volatile("movl %0,%%cr3" : : "r" (val));
}
// vm.c
// Return the address of the PTE in page table pgdir
// that corresponds to virtual address va. If alloc!=0,
// create any required page table pages.
static pte_t *
walkpgdir(pde_t *pgdir, const void *va, int alloc)
{
pde_t *pde;
pte_t *pgtab;
pde = &pgdir[PDX(va)];
if(*pde & PTE_P){
pgtab = (pte_t*)P2V(PTE_ADDR(*pde));
} else {
if(!alloc || (pgtab = (pte_t*)kalloc()) == 0)
return 0;
// Make sure all those PTE_P bits are zero.
memset(pgtab, 0, PGSIZE);
// The permissions here are overly generous, but they can
// be further restricted by the permissions in the page table
// entries, if necessary.
*pde = V2P(pgtab) | PTE_P | PTE_W | PTE_U;
}
return &pgtab[PTX(va)];
}
// Create PTEs for virtual addresses starting at va that refer to
// physical addresses starting at pa. va and size might not
// be page-aligned.
static int
mappages(pde_t *pgdir, void *va, uint size, uint pa, int perm)
{
char *a, *last;
pte_t *pte;
a = (char*)PGROUNDDOWN((uint)va);
last = (char*)PGROUNDDOWN(((uint)va) + size - 1);
for(;;){
if((pte = walkpgdir(pgdir, a, 1)) == 0)
return -1;
if(*pte & PTE_P)
panic("remap");
*pte = pa | perm | PTE_P;
if(a == last)
break;
a += PGSIZE;
pa += PGSIZE;
}
return 0;
}
xv6對於記憶體的設置與釋放皆為一次4096Byte為一個單位(即為一個page),透過Linking List連結未使用的page,設置page時,將該page移出連結,反之移入。
main呼叫kinit1()與kinit2()來初始化allocator,兩者皆透過freerange()來設置記憶體資源,freerang()再呼叫kfree()將記憶體放入Linking List中。
// kalloc.c
struct run {
struct run *next;
};
// Initialization happens in two phases.
// 1. main() calls kinit1() while still using entrypgdir to place just
// the pages mapped by entrypgdir on free list.
// 2. main() calls kinit2() with the rest of the physical pages
// after installing a full page table that maps them on all cores.
void
kinit1(void *vstart, void *vend)
{
initlock(&kmem.lock, "kmem");
kmem.use_lock = 0;
freerange(vstart, vend);
}
void
kinit2(void *vstart, void *vend)
{
freerange(vstart, vend);
kmem.use_lock = 1;
}
void
freerange(void *vstart, void *vend)
{
char *p;
p = (char*)PGROUNDUP((uint)vstart);
for(; p + PGSIZE <= (char*)vend; p += PGSIZE)
kfree(p);
}
//PAGEBREAK: 21
// Free the page of physical memory pointed at by v,
// which normally should have been returned by a
// call to kalloc(). (The exception is when
// initializing the allocator; see kinit above.)
void
kfree(char *v)
{
struct run *r;
if((uint)v % PGSIZE || v < end || V2P(v) >= PHYSTOP)
panic("kfree");
// Fill with junk to catch dangling refs.
memset(v, 1, PGSIZE);
if(kmem.use_lock)
acquire(&kmem.lock);
r = (struct run*)v;
r->next = kmem.freelist;
kmem.freelist = r;
if(kmem.use_lock)
release(&kmem.lock);
}
// Allocate one 4096-byte page of physical memory.
// Returns a pointer that the kernel can use.
// Returns 0 if the memory cannot be allocated.
char*
kalloc(void)
{
struct run *r;
if(kmem.use_lock)
acquire(&kmem.lock);
r = kmem.freelist;
if(r)
kmem.freelist = r->next;
if(kmem.use_lock)
release(&kmem.lock);
return (char*)r;
}
每個程序的Virtual Address起始位置皆從0開始,並放入該程序的程式碼,並利用Stack與Heap來增加使用的空間。
Gurad Page本身並沒有相對應的位置,若不小心參考到該位址時則會回傳錯誤訊息。
透過sys_sbrk()呼叫growproc()來控制Heap所擁有的空間,當要增加空間時,growproc()呼叫allocuvm(),減少則呼叫deallocuvm(),最後再呼叫switchuvm()更新程序的狀態。
// syscall.c
// Fetch the nth 32-bit system call argument.
int
argint(int n, int *ip)
{
return fetchint((myproc()->tf->esp) + 4 + 4*n, ip);
}
// sysproc.c
int
sys_sbrk(void)
{
int addr;
int n;
if(argint(0, &n) < 0)
return -1;
addr = myproc()->sz;
if(growproc(n) < 0)
return -1;
return addr;
}
// proc.h
// Per-process state
struct proc {
uint sz; // Size of process memory (bytes)
pde_t* pgdir; // Page table
char *kstack; // Bottom of kernel stack for this process
enum procstate state; // Process state
int pid; // Process ID
struct proc *parent; // Parent process
struct trapframe *tf; // Trap frame for current syscall
struct context *context; // swtch() here to run process
void *chan; // If non-zero, sleeping on chan
int killed; // If non-zero, have been killed
struct file *ofile[NOFILE]; // Open files
struct inode *cwd; // Current directory
char name[16]; // Process name (debugging)
};
// proc.c
// Disable interrupts so that we are not rescheduled
// while reading proc from the cpu structure
struct proc*
myproc(void) {
struct cpu *c;
struct proc *p;
pushcli();
c = mycpu();
p = c->proc;
popcli();
return p;
}
// Grow current process's memory by n bytes.
// Return 0 on success, -1 on failure.
int
growproc(int n)
{
uint sz;
struct proc *curproc = myproc();
sz = curproc->sz;
if(n > 0){
if((sz = allocuvm(curproc->pgdir, sz, sz + n)) == 0)
return -1;
} else if(n < 0){
if((sz = deallocuvm(curproc->pgdir, sz, sz + n)) == 0)
return -1;
}
curproc->sz = sz;
switchuvm(curproc);
return 0;
}
// vm.c
// Allocate page tables and physical memory to grow process from oldsz to
// newsz, which need not be page aligned. Returns new size or 0 on error.
int
allocuvm(pde_t *pgdir, uint oldsz, uint newsz)
{
char *mem;
uint a;
if(newsz >= KERNBASE)
return 0;
if(newsz < oldsz)
return oldsz;
a = PGROUNDUP(oldsz);
for(; a < newsz; a += PGSIZE){
mem = kalloc();
if(mem == 0){
cprintf("allocuvm out of memory\n");
deallocuvm(pgdir, newsz, oldsz);
return 0;
}
memset(mem, 0, PGSIZE);
if(mappages(pgdir, (char*)a, PGSIZE, V2P(mem), PTE_W|PTE_U) < 0){
cprintf("allocuvm out of memory (2)\n");
deallocuvm(pgdir, newsz, oldsz);
kfree(mem);
return 0;
}
}
return newsz;
}
// Deallocate user pages to bring the process size from oldsz to
// newsz. oldsz and newsz need not be page-aligned, nor does newsz
// need to be less than oldsz. oldsz can be larger than the actual
// process size. Returns the new process size.
int
deallocuvm(pde_t *pgdir, uint oldsz, uint newsz)
{
pte_t *pte;
uint a, pa;
if(newsz >= oldsz)
return oldsz;
a = PGROUNDUP(newsz);
for(; a < oldsz; a += PGSIZE){
pte = walkpgdir(pgdir, (char*)a, 0);
if(!pte)
a = PGADDR(PDX(a) + 1, 0, 0) - PGSIZE;
else if((*pte & PTE_P) != 0){
pa = PTE_ADDR(*pte);
if(pa == 0)
panic("kfree");
char *v = P2V(pa);
kfree(v);
*pte = 0;
}
}
return newsz;
}
// Switch TSS and h/w page table to correspond to process p.
void
switchuvm(struct proc *p)
{
if(p == 0)
panic("switchuvm: no process");
if(p->kstack == 0)
panic("switchuvm: no kstack");
if(p->pgdir == 0)
panic("switchuvm: no pgdir");
pushcli();
mycpu()->gdt[SEG_TSS] = SEG16(STS_T32A, &mycpu()->ts,
sizeof(mycpu()->ts)-1, 0);
mycpu()->gdt[SEG_TSS].s = 0;
mycpu()->ts.ss0 = SEG_KDATA << 3;
mycpu()->ts.esp0 = (uint)p->kstack + KSTACKSIZE;
// setting IOPL=0 in eflags *and* iomb beyond the tss segment limit
// forbids I/O instructions (e.g., inb and outb) from user space
mycpu()->ts.iomb = (ushort) 0xFFFF;
ltr(SEG_TSS << 3);
lcr3(V2P(p->pgdir)); // switch to process's address space
popcli();
}
透過sys_exec()得知的參數提供給exec(),exec()再利用readi()和elf.magic檢查是否可以讀取與執行,接著呼叫setupkvm()建立page,透過loaduvm()將程序寫入Page Table,再按照格式把argv等相關資料填入Stack中,最後exec()將程序名稱以safestrcpy()進行儲存,以供偵錯使用。
// sysfile.c
int
sys_exec(void)
{
char *path, *argv[MAXARG];
int i;
uint uargv, uarg;
if(argstr(0, &path) < 0 || argint(1, (int*)&uargv) < 0){
return -1;
}
memset(argv, 0, sizeof(argv));
for(i=0;; i++){
if(i >= NELEM(argv))
return -1;
if(fetchint(uargv+4*i, (int*)&uarg) < 0)
return -1;
if(uarg == 0){
argv[i] = 0;
break;
}
if(fetchstr(uarg, &argv[i]) < 0)
return -1;
}
return exec(path, argv);
}
// fs.c
struct inode*
namei(char *path)
{
char name[DIRSIZ];
return namex(path, 0, name);
}
//PAGEBREAK!
// Read data from inode.
// Caller must hold ip->lock.
int
readi(struct inode *ip, char *dst, uint off, uint n)
{
uint tot, m;
struct buf *bp;
if(ip->type == T_DEV){
if(ip->major < 0 || ip->major >= NDEV || !devsw[ip->major].read)
return -1;
return devsw[ip->major].read(ip, dst, n);
}
if(off > ip->size || off + n < off)
return -1;
if(off + n > ip->size)
n = ip->size - off;
for(tot=0; tot<n; tot+=m, off+=m, dst+=m){
bp = bread(ip->dev, bmap(ip, off/BSIZE));
m = min(n - tot, BSIZE - off%BSIZE);
memmove(dst, bp->data + off%BSIZE, m);
brelse(bp);
}
return n;
}
// elf.h
// Format of an ELF executable file
#define ELF_MAGIC 0x464C457FU // "\x7FELF" in little endian
// File header
struct elfhdr {
uint magic; // must equal ELF_MAGIC
uchar elf[12];
ushort type;
ushort machine;
uint version;
uint entry;
uint phoff;
uint shoff;
uint flags;
ushort ehsize;
ushort phentsize;
ushort phnum;
ushort shentsize;
ushort shnum;
ushort shstrndx;
};
// Program section header
struct proghdr {
uint type;
uint off;
uint vaddr;
uint paddr;
uint filesz;
uint memsz;
uint flags;
uint align;
};
// Values for Proghdr type
#define ELF_PROG_LOAD 1
// Flag bits for Proghdr flags
#define ELF_PROG_FLAG_EXEC 1
#define ELF_PROG_FLAG_WRITE 2
#define ELF_PROG_FLAG_READ 4
// exec.c
int
exec(char *path, char **argv)
{
char *s, *last;
int i, off;
uint argc, sz, sp, ustack[3+MAXARG+1];
struct elfhdr elf;
struct inode *ip;
struct proghdr ph;
pde_t *pgdir, *oldpgdir;
struct proc *curproc = myproc();
begin_op();
if((ip = namei(path)) == 0){
end_op();
cprintf("exec: fail\n");
return -1;
}
ilock(ip);
pgdir = 0;
// Check ELF header
if(readi(ip, (char*)&elf, 0, sizeof(elf)) != sizeof(elf))
goto bad;
if(elf.magic != ELF_MAGIC)
goto bad;
if((pgdir = setupkvm()) == 0)
goto bad;
// Load program into memory.
sz = 0;
for(i=0, off=elf.phoff; i<elf.phnum; i++, off+=sizeof(ph)){
if(readi(ip, (char*)&ph, off, sizeof(ph)) != sizeof(ph))
goto bad;
if(ph.type != ELF_PROG_LOAD)
continue;
if(ph.memsz < ph.filesz)
goto bad;
if(ph.vaddr + ph.memsz < ph.vaddr)
goto bad;
if((sz = allocuvm(pgdir, sz, ph.vaddr + ph.memsz)) == 0)
goto bad;
if(ph.vaddr % PGSIZE != 0)
goto bad;
if(loaduvm(pgdir, (char*)ph.vaddr, ip, ph.off, ph.filesz) < 0)
goto bad;
}
iunlockput(ip);
end_op();
ip = 0;
// Allocate two pages at the next page boundary.
// Make the first inaccessible. Use the second as the user stack.
sz = PGROUNDUP(sz);
if((sz = allocuvm(pgdir, sz, sz + 2*PGSIZE)) == 0)
goto bad;
clearpteu(pgdir, (char*)(sz - 2*PGSIZE));
sp = sz;
// Push argument strings, prepare rest of stack in ustack.
for(argc = 0; argv[argc]; argc++) {
if(argc >= MAXARG)
goto bad;
sp = (sp - (strlen(argv[argc]) + 1)) & ~3;
if(copyout(pgdir, sp, argv[argc], strlen(argv[argc]) + 1) < 0)
goto bad;
ustack[3+argc] = sp;
}
ustack[3+argc] = 0;
ustack[0] = 0xffffffff; // fake return PC
ustack[1] = argc;
ustack[2] = sp - (argc+1)*4; // argv pointer
sp -= (3+argc+1) * 4;
if(copyout(pgdir, sp, ustack, (3+argc+1)*4) < 0)
goto bad;
// Save program name for debugging.
for(last=s=path; *s; s++)
if(*s == '/')
last = s+1;
safestrcpy(curproc->name, last, sizeof(curproc->name));
// Commit to the user image.
oldpgdir = curproc->pgdir;
curproc->pgdir = pgdir;
curproc->sz = sz;
curproc->tf->eip = elf.entry; // main
curproc->tf->esp = sp;
switchuvm(curproc);
freevm(oldpgdir);
return 0;
bad:
if(pgdir)
freevm(pgdir);
if(ip){
iunlockput(ip);
end_op();
}
return -1;
}
// vm.c
// Load a program segment into pgdir. addr must be page-aligned
// and the pages from addr to addr+sz must already be mapped.
int
loaduvm(pde_t *pgdir, char *addr, struct inode *ip, uint offset, uint sz)
{
uint i, pa, n;
pte_t *pte;
if((uint) addr % PGSIZE != 0)
panic("loaduvm: addr must be page aligned");
for(i = 0; i < sz; i += PGSIZE){
if((pte = walkpgdir(pgdir, addr+i, 0)) == 0)
panic("loaduvm: address should exist");
pa = PTE_ADDR(*pte);
if(sz - i < PGSIZE)
n = sz - i;
else
n = PGSIZE;
if(readi(ip, P2V(pa), offset+i, n) != n)
return -1;
}
return 0;
}
// string.c
// Like strncpy but guaranteed to NUL-terminate.
char*
safestrcpy(char *s, const char *t, int n)
{
char *os;
os = s;
if(n <= 0)
return os;
while(--n > 0 && (*s++ = *t++) != 0)
;
*s = 0;
return os;
}