Chapter 2. Page tables

  xv6使用了Page tables mechanism讓不同process各自的address space可以對應到相同的physical memory,且能保護不同process各自的memory不互相影響。

1. 原理


根據Figure 2-1表示,地址的轉換都會經由兩個步驟。
  有一個4096 bytes的page directory,裡面存有1024個page table pages的reference,而每個page table pages中又有1024個32 bits的page table entry(PTE)。
  首先paging hardware會先使用virtual address的top 10 bits去找到page directory entry。
  如果存在的話,paging hardware就會使用接下來的10 bits去找到PTE,如果page directory entry或PTE不存在的話,那paging hardware就會raise a fault。
  每個PTE都會包含flag bits,如PTE_P代表PTE存在,PTE_W代表是否允許寫入這個page和PTE_U控制user program是否能使用這個page等等。

2. 程式流程

3. 程式碼說明

kvmalloc

1839 void
1840 kvmalloc(void)
1841 {
1842   kpgdir = setupkvm();
1843   switchkvm();
1844 }
  1. 呼叫setupkvm和switchkvm來建立一個page table,並切換page table

setupkvm

1817 pde_t*
1818 setupkvm(void)
1819 {
1820   pde_t *pgdir;
1821   struct kmap *k;
1822 
1823   if((pgdir = (pde_t*)kalloc()) == 0)
1824     return 0;
1825   memset(pgdir, 0, PGSIZE);
1826   if (P2V(PHYSTOP) > (void*)DEVSPACE)
1827     panic("PHYSTOP too high");
1828   for(k = kmap; k < &kmap[NELEM(kmap)]; k++)
1829     if(mappages(pgdir, k−>virt, k−>phys_end − k−>phys_start,
1830                 (uint)k−>phys_start, k−>perm) < 0) {
1831       freevm(pgdir);
1832       return 0;
1833     }
1834   return pgdir;
1835 }

功能為建立一個page table

  1. 呼叫kalloc分配一個virtual addresses space
  2. 呼叫mappages依照kmap建立 virtual addresse和physical addresses的translations
    如果發生錯誤的話,則呼叫freevm歸還已經被allocate的page table,之後return

mappages

1759 static int
1760 mappages(pde_t *pgdir, void *va, uint size, uint pa, int perm)
1761 {
1762   char *a, *last;
1763   pte_t *pte;
1764 
1765   a = (char*)PGROUNDDOWN((uint)va);
1766   last = (char*)PGROUNDDOWN(((uint)va) + size − 1);
1767   for(;;){
1768     if((pte = walkpgdir(pgdir, a, 1)) == 0)
1769       return −1;
1770     if(*pte & PTE_P)
1771       panic("remap");
1772     *pte = pa | perm | PTE_P;
1773     if(a == last)
1774       break;
1775     a += PGSIZE;
1776     pa += PGSIZE;
1777   }
1778   return 0;
1779 }

功能為建立virtual addresse和physical addresses的translations

  1. 對每個已經被mapped的virtual address,呼叫walkpgdir來找到PTE的address.
  2. 初始化PTE的relevant physical page number,PTE_W、PTE_U和PTE_P,並標示PTE_P是有效的

walkpgdir

1734 static pte_t *
1735 walkpgdir(pde_t *pgdir, const void *va, int alloc)
1736 {
1737   pde_t *pde;
1738   pte_t *pgtab;
1739 
1740   pde = &pgdir[PDX(va)];
1741   if(*pde & PTE_P){
1742     pgtab = (pte_t*)P2V(PTE_ADDR(*pde));
1743   } else {
1744     if(!alloc || (pgtab = (pte_t*)kalloc()) == 0)
1745       return 0;
1746     // Make sure all those PTE_P bits are zero.
1747     memset(pgtab, 0, PGSIZE);
1748     // The permissions here are overly generous, but they can
1749     // be further restricted by the permissions in the page table
1750     // entries, if necessary.
1751     *pde = V2P(pgtab) | PTE_P | PTE_W | PTE_U;
1752   }
1753   return &pgtab[PTX(va)];
1754 }

功能為找到在page table中PTE的address

  1. 根據virtual address的top 10 bits找到page directory entry
  2. 如果不存在,就新建一個page table page並設定屬性
  3. 如果存在就根據下10 bits找到PTE的address並回傳

kalloc

3186 char*
3187 kalloc(void)
3188 {
3189   struct run *r;
3190 
3191   if(kmem.use_lock)
3192     acquire(&kmem.lock);
3193   r = kmem.freelist;
3194   if(r)
3195     kmem.freelist = r−>next;
3196   if(kmem.use_lock)
3197     release(&kmem.lock);
3198   return (char*)r;
3199 }

功能為分配一個virtual addresses space

  1. 變數r指向free list的頭
  2. free list = r.next
  3. return r

kinit1和kinit2

3130 void
3131 kinit1(void *vstart, void *vend)
3132 {
3133   initlock(&kmem.lock, "kmem");
3134   kmem.use_lock = 0;
3135   freerange(vstart, vend);
3136 }
3137 
3138 void
3139 kinit2(void *vstart, void *vend)
3140 {
3141   freerange(vstart, vend);
3142   kmem.use_lock = 1;
3143 }

kinit1和kinit2都是在初始化memory,差別只在kinit1是lock-less allocation,用來設定前4 MB,而kinit2 enables locking來安排更多allocatable的memory。

  1. 呼叫freerange來初始化physical memory

freerange

3150 void
3151 freerange(void *vstart, void *vend)
3152 {
3153   char *p;
3154   p = (char*)PGROUNDUP((uint)vstart);
3155   for(; p + PGSIZE <= (char*)vend; p += PGSIZE)
3156     kfree(p);
3157 }

功能為初始化physical memory

  1. 呼叫PGROUNDUP來確保他只free掉對齊的physical addresses
  2. 呼叫kfree來釋放memory

kfree

3163 void
3164 kfree(char *v)
3165 {
3166   struct run *r;
3167 
3168   if((uint)v % PGSIZE || v < end || V2P(v) >= PHYSTOP)
3169     panic("kfree");
3170 
3171   // Fill with junk to catch dangling refs.
3172   memset(v, 1, PGSIZE);
3173 
3174   if(kmem.use_lock)
3175     acquire(&kmem.lock);
3176   r = (struct run*)v;
3177   r−>next = kmem.freelist;
3178   kmem.freelist = r;
3179   if(kmem.use_lock)
3180     release(&kmem.lock);
3181 }

功能為釋放memory

  1. 呼叫memset來把memory初始化為1
  2. 將這塊memory加到free list的頭
  3. 回傳free list的頭

growproc

2557 int
2558 growproc(int n)
2559 {
2560   uint sz;
2561   struct proc *curproc = myproc();
2562 
2563   sz = curproc−>sz;
2564   if(n > 0){
2565     if((sz = allocuvm(curproc−>pgdir, sz, sz + n)) == 0)
2566       return −1;
2567   } else if(n < 0){
2568     if((sz = deallocuvm(curproc−>pgdir, sz, sz + n)) == 0)
2569       return −1;
2570   }
2571   curproc−>sz = sz;
2572   switchuvm(curproc);
2573   return 0;
2574 }

功能為把memory擴充或縮小n bytes

  1. 如果n>0,則呼叫allocuvm將memory的空間從sz擴充到sz+n
  2. 如果n<0,則呼叫deallocuvm將memory的空間從sz縮小到sz+n

exec

6609 int
6610 exec(char *path, char **argv)
6611 {
6612   char *s, *last;
6613   int i, off;
6614   uint argc, sz, sp, ustack[3+MAXARG+1];
6615   struct elfhdr elf;
6616   struct inode *ip;
6617   struct proghdr ph;
6618   pde_t *pgdir, *oldpgdir;
6619   struct proc *curproc = myproc();
6620 
6621   begin_op();
6622 
6623   if((ip = namei(path)) == 0){
6624     end_op();
6625     cprintf("exec: fail\n");
6626     return −1;
6627   }
6628   ilock(ip);
6629   pgdir = 0;
6630 
6631   // Check ELF header
6632   if(readi(ip, (char*)&elf, 0, sizeof(elf)) != sizeof(elf))
6633     goto bad;
6634   if(elf.magic != ELF_MAGIC)
6635     goto bad;
6636 
6637   if((pgdir = setupkvm()) == 0)
6638     goto bad;
6639 
6640   // Load program into memory.
6641   sz = 0;
6642   for(i=0, off=elf.phoff; i<elf.phnum; i++, off+=sizeof(ph)){
6643     if(readi(ip, (char*)&ph, off, sizeof(ph)) != sizeof(ph))
6644       goto bad;
6645     if(ph.type != ELF_PROG_LOAD)
6646       continue;
6647     if(ph.memsz < ph.filesz)
6648       goto bad;
6649     if(ph.vaddr + ph.memsz < ph.vaddr)
6650       goto bad;
6651     if((sz = allocuvm(pgdir, sz, ph.vaddr + ph.memsz)) == 0)
6652       goto bad;
6653     if(ph.vaddr % PGSIZE != 0)
6654       goto bad;
6655     if(loaduvm(pgdir, (char*)ph.vaddr, ip, ph.off, ph.filesz) < 0)
6656       goto bad;
6657   }
6658   iunlockput(ip);
6659   end_op();
6660   ip = 0;
6661 
6662   // Allocate two pages at the next page boundary.
6663   // Make the first inaccessible.  Use the second as the user stack.
6664   sz = PGROUNDUP(sz);
6665   if((sz = allocuvm(pgdir, sz, sz + 2*PGSIZE)) == 0)
6666     goto bad;
6667   clearpteu(pgdir, (char*)(sz − 2*PGSIZE));
6668   sp = sz;
6669 
6670   // Push argument strings, prepare rest of stack in ustack.
6671   for(argc = 0; argv[argc]; argc++) {
6672     if(argc >= MAXARG)
6673       goto bad;
6674     sp = (sp − (strlen(argv[argc]) + 1)) & ~3;
6675     if(copyout(pgdir, sp, argv[argc], strlen(argv[argc]) + 1) < 0)
6676       goto bad;
6677     ustack[3+argc] = sp;
6678   }
6679   ustack[3+argc] = 0;
6680 
6681   ustack[0] = 0xffffffff;  // fake return PC
6682   ustack[1] = argc;
6683   ustack[2] = sp − (argc+1)*4;  // argv pointer
6684 
6685   sp −= (3+argc+1) * 4;
6686   if(copyout(pgdir, sp, ustack, (3+argc+1)*4) < 0)
6687     goto bad;
6688 
6689   // Save program name for debugging.
6690   for(last=s=path; *s; s++)
6691     if(*s == ’/’)
6692       last = s+1;
6693   safestrcpy(curproc−>name, last, sizeof(curproc−>name));
6694 
6695   // Commit to the user image.
6696   oldpgdir = curproc−>pgdir;
6697   curproc−>pgdir = pgdir;
6698   curproc−>sz = sz;
6699   curproc−>tf−>eip = elf.entry;  // main
6700   curproc−>tf−>esp = sp;
6701   switchuvm(curproc);
6702   freevm(oldpgdir);
6703   return 0;
6704 
6705  bad:
6706   if(pgdir)
6707     freevm(pgdir);
6708   if(ip){
6709     iunlockput(ip);
6710     end_op();
6711   }
6712   return −1;
6713 }
  1. 呼叫namei來開啟named binary path,如果失敗就return -1。
  2. 檢查ELF header的開頭4 byte有沒有符合ELF_MAGIC,如果有exec就假設這個binary是對的。
  3. 呼叫setupkvm來allocate一個page table
  4. 分析elf的各個segment
    i. 呼叫allocuvm來幫每個ELF segment allocate一個新的memory
    ii. 呼叫loaduvm來把每個segment讀進memory裡
  5. 呼叫allocuvm來allocate兩個memory
  6. 呼叫clearpteu清除第一個page的PTE_U
  7. allocate一個stack page用來存argument strings並使用ustack來記錄這些pointers,放一個null pointer在ustack的最後,後面會做為argv list傳給main
  8. 設定path
  9. 保存舊的page table
  10. 更新process的page table
  11. 更新process的空間大小
  12. 呼叫switchuvm來切換page table
  13. 呼叫freevm來free掉舊的page table
  14. return 0
    如果過程中有出錯的部分,全都會jump到label bad,這段程式碼會呼叫freevm去free掉新的page table,並return -1

namei

5789 struct inode*
5790 namei(char *path)
5791 {
5792   char name[DIRSIZ];
5793   return namex(path, 0, name);
5794 }

功能為開啟named binary path

allocuvm

1926 int
1927 allocuvm(pde_t *pgdir, uint oldsz, uint newsz)
1928 {
1929   char *mem;
1930   uint a;
1931 
1932   if(newsz >= KERNBASE)
1933     return 0;
1934   if(newsz < oldsz)
1935     return oldsz;
1936 
1937   a = PGROUNDUP(oldsz);
1938   for(; a < newsz; a += PGSIZE){
1939     mem = kalloc();
1940     if(mem == 0){
1941       cprintf("allocuvm out of memory\n");
1942       deallocuvm(pgdir, newsz, oldsz);
1943       return 0;
1944     }
1945     memset(mem, 0, PGSIZE);
1946     if(mappages(pgdir, (char*)a, PGSIZE, V2P(mem), PTE_W|PTE_U) < 0){
1947       cprintf("allocuvm out of memory (2)\n");
1948       deallocuvm(pgdir, newsz, oldsz);
1949       kfree(mem);
1950       return 0;
1951     }
1952   }
1953   return newsz;
1954 }

功能為將memory的空間從oldsz擴充到newsz

  1. 檢查newsize是不是大於KERNBASE,是就直接return 0
  2. 檢查newsize是不是小於oldsize,是就回傳oldsize
  3. 先呼叫kalloc分配一個page(virtual addresses)
  4. 呼叫memset初始化memory為0,
  5. 呼叫mappages來建立physical addresses和virtual addresses的translations
  6. 回傳newsize
    如果中間有error,則歸還已經拿到的page和physical memory後回傳0

loaduvm

1902 int
1903 loaduvm(pde_t *pgdir, char *addr, struct inode *ip, uint offset, uint sz)
1904 {
1905   uint i, pa, n;
1906   pte_t *pte;
1907 
1908   if((uint) addr % PGSIZE != 0)
1909     panic("loaduvm: addr must be page aligned");
1910   for(i = 0; i < sz; i += PGSIZE){
1911     if((pte = walkpgdir(pgdir, addr+i, 0)) == 0)
1912       panic("loaduvm: address should exist");
1913     pa = PTE_ADDR(*pte);
1914     if(sz − i < PGSIZE)
1915       n = sz − i;
1916     else
1917       n = PGSIZE;
1918     if(readi(ip, P2V(pa), offset+i, n) != n)
1919       return −1;
1920   }
1921   return 0;
1922 }

功能為把program segment load到pgdir裡

  1. 先檢查是不是page aligned
  2. 呼叫walkpgdir來找到PTE的address
  3. 用pte來計算physical memory address
  4. 使用readi讀取內容後寫入memory裡

freevm

2002 void
2003 freevm(pde_t *pgdir)
2004 {
2005   uint i;
2006 
2007   if(pgdir == 0)
2008     panic("freevm: no pgdir");
2009   deallocuvm(pgdir, KERNBASE, 0);
2010   for(i = 0; i < NPDENTRIES; i++){
2011     if(pgdir[i] & PTE_P){
2012       char * v = P2V(PTE_ADDR(pgdir[i]));
2013       kfree(v);
2014     }
2015   }
2016   kfree((char*)pgdir);
2017 }

功能為free掉一個page table和virtual memory pages

  1. 檢查page table是否存在
  2. 呼叫deallocuvm來free掉0~ KERNBASE的virtual memory pages
  3. free掉整個page table

4. 資料結構

elf.h

0950 // Format of an ELF executable file
0951 
0952 #define ELF_MAGIC 0x464C457FU  // "\x7FELF" in little endian
0953 
0954 // File header
0955 struct elfhdr {
0956   uint magic;  // must equal ELF_MAGIC
0957   uchar elf[12];
0958   ushort type;//表示文件的類型
0959   ushort machine;//表示執行程式需要的機器架構
0960   uint version;//表示版本
0961   uint entry;//文件的起始點地址
0962   uint phoff;//Program Header Table相對於文件的位置
0963   uint shoff;//Section header table相對於文件的位置
0964   uint flags;//特定CPU標示
0965   ushort ehsize;//表示ELF header文件的大小
0966   ushort phentsize;//Program Header Table一個入口的大小
0967   ushort phnum;//Program Header Table入口的個數
0968   ushort shentsize;//Section header table一個入口的大小
0969   ushort shnum;//Section header table入口的個數
0970   ushort shstrndx;
0971 };
0972 
0973 // Program section header
0974 struct proghdr {
0975   uint type;//表示型態
0976   uint off;//第一個byte在文件中的偏移
0977   uint vaddr;//第一個byte在memory的virtual address
0978   uint paddr;//第一個byte在memory的physical address
0979   uint filesz;//在文件中的長度
0980   uint memsz;//在memory中的長度
0981   uint flags;//1:可執行 2:可寫入 3:可讀取
0982   uint align;//在文件和memory中如何對齊
0983 };
0984 
0985 // Values for Proghdr type
0986 #define ELF_PROG_LOAD           1
0987 
0988 // Flag bits for Proghdr flags
0989 #define ELF_PROG_FLAG_EXEC      1
0990 #define ELF_PROG_FLAG_WRITE     2
0991 #define ELF_PROG_FLAG_READ      4