Since x86 instructions only operate on virtual address, we need the mechanism to translaing hardware is the mechanism that connects these two kind of addresses.
For the figure above, the page tables is stored as two dimensional array. To translate from virtual address. First, the paging hardware use top 10 bits of virtual address to find the page directory entry. Second, the paging hardware use next 10 bits to select PTE from the page table. The top 20 bits of physical address would be the PPN of the PTE, next 12 bits would be the offset of virtual address.
For the detail of flags of PTE in mmu.h
// Page table/directory entry flags.
#define PTE_P 0x001 // Present
#define PTE_W 0x002 // Writeable
#define PTE_U 0x004 // User
#define PTE_PWT 0x008 // Write-Through
#define PTE_PCD 0x010 // Cache-Disable
#define PTE_A 0x020 // Accessed
#define PTE_D 0x040 // Dirty
#define PTE_PS 0x080 // Page Size
#define PTE_MBZ 0x180 // Bits must be zero
We can see xv6’s memory layout in memlayout.h
// Memory layout
#define EXTMEM 0x100000 // Start of extended memory
#define PHYSTOP 0xE000000 // Top physical memory
#define DEVSPACE 0xFE000000 // Other devices are at high addresses
// Key addresses for address space layout (see kmap in vm.c for layout)
#define KERNBASE 0x80000000 // First kernel virtual address
#define KERNLINK (KERNBASE+EXTMEM) // Address where kernel is linked
Which can be drawn as the figure below:
The figure above show that over 2 GB (KERNBASE
= 0x80000000) is belong to kernel address space, and under 2 GB is for user address space.
In a nutshell, the program flow can be represented as the figure below:
First, main
calls kvmalloc
from vm.c
to create new page table in order to replace the original one.
// Allocate one page table for the machine for the kernel address
// space for scheduler processes.
void
kvmalloc(void)
{
kpgdir = setupkvm();
switchkvm();
}
In kvmalloc
, which in fact calls setupkvm
to create page directory according to struct kmap
. In detail, setupkvm
first use kalloc
to allocate a page of pointer to save page directories (PD). In every page directory entry (PDE) points to a page of page table (PT).
// This table defines the kernel's mappings, which are present in
// every process's page table.
static struct kmap {
void *virt;
uint phys_start;
uint phys_end;
int perm;
} kmap[] = {
{ (void*)KERNBASE, 0, EXTMEM, PTE_W}, // I/O space
{ (void*)KERNLINK, V2P(KERNLINK), V2P(data), 0}, // kern text+rodata
{ (void*)data, V2P(data), PHYSTOP, PTE_W}, // kern data+memory
{ (void*)DEVSPACE, DEVSPACE, 0, PTE_W}, // more devices
};
// Set up kernel part of a page table.
pde_t*
setupkvm(void)
{
pde_t *pgdir;
struct kmap *k;
if((pgdir = (pde_t*)kalloc()) == 0)
return 0;
memset(pgdir, 0, PGSIZE);
if (P2V(PHYSTOP) > (void*)DEVSPACE)
panic("PHYSTOP too high");
for(k = kmap; k < &kmap[NELEM(kmap)]; k++)
if(mappages(pgdir, k->virt, k->phys_end - k->phys_start,
(uint)k->phys_start, k->perm) < 0) {
freevm(pgdir);
return 0;
}
return pgdir;
}
During scanning every element in kmap
, setupkvm
calls mappages
to construct the PTEs.
// Create PTEs for virtual addresses starting at va that refer to
// physical addresses starting at pa. va and size might not
// be page-aligned.
static int
mappages(pde_t *pgdir, void *va, uint size, uint pa, int perm)
{
char *a, *last;
pte_t *pte;
a = (char*)PGROUNDDOWN((uint)va);
last = (char*)PGROUNDDOWN(((uint)va) + size - 1);
for(;;){
if((pte = walkpgdir(pgdir, a, 1)) == 0)
return -1;
if(*pte & PTE_P)
panic("remap");
*pte = pa | perm | PTE_P;
if(a == last)
break;
a += PGSIZE;
pa += PGSIZE;
}
return 0;
}
Then, setupkvm
calls walkpgdir
to walk through all PTEs. walkpgdir
use top 10 bits to check if PDE is present. Finally, walkpgdir
use next 10 bits to find the address of the PTE in the page table page.
// Return the address of the PTE in page table pgdir
// that corresponds to virtual address va. If alloc!=0,
// create any required page table pages.
static pte_t *
walkpgdir(pde_t *pgdir, const void *va, int alloc)
{
pde_t *pde;
pte_t *pgtab;
pde = &pgdir[PDX(va)];
if(*pde & PTE_P){
pgtab = (pte_t*)P2V(PTE_ADDR(*pde));
} else {
if(!alloc || (pgtab = (pte_t*)kalloc()) == 0)
return 0;
// Make sure all those PTE_P bits are zero.
memset(pgtab, 0, PGSIZE);
// The permissions here are overly generous, but they can
// be further restricted by the permissions in the page table
// entries, if necessary.
*pde = V2P(pgtab) | PTE_P | PTE_W | PTE_U;
}
return &pgtab[PTX(va)];
}
After walking through all kmap
elements. setupkvm
returns and kvmalloc
will call switchkvm
to reload %cr3
.
The program flow of physical memory allocation can be represented as the figure below:
xv6 uses the concept of a pool of free memory, which slice all free memory as 4KB pages and organize them with a linked list. That’s convenient for xv6 to manage the resources.
First, main
calls kinit1
in kalloc.c
to set up the first 4 MB unlocked memory for main
to use. Then call kinit2
to set up the remainder with locked memory for allocating.
kinit1(end, P2V(4*1024*1024)); // phys page allocator
kinit2(P2V(4*1024*1024), P2V(PHYSTOP)); // must come after startothers()
// Initialization happens in two phases.
// 1. main() calls kinit1() while still using entrypgdir to place just
// the pages mapped by entrypgdir on free list.
// 2. main() calls kinit2() with the rest of the physical pages
// after installing a full page table that maps them on all cores.
void
kinit1(void *vstart, void *vend)
{
initlock(&kmem.lock, "kmem");
kmem.use_lock = 0;
freerange(vstart, vend);
}
void
kinit2(void *vstart, void *vend)
{
freerange(vstart, vend);
kmem.use_lock = 1;
}
Both kinit1
and kinit2
calls freerange
to split memory to pages.
void
freerange(void *vstart, void *vend)
{
char *p;
p = (char*)PGROUNDUP((uint)vstart);
for(; p + PGSIZE <= (char*)vend; p += PGSIZE)
kfree(p);
}
freerange
finally calls kfree
to join free pages to the linked list.
// Free the page of physical memory pointed at by v,
// which normally should have been returned by a
// call to kalloc(). (The exception is when
// initializing the allocator; see kinit above.)
void
kfree(char *v)
{
struct run *r;
if((uint)v % PGSIZE || v < end || V2P(v) >= PHYSTOP)
panic("kfree");
// Fill with junk to catch dangling refs.
memset(v, 1, PGSIZE);
if(kmem.use_lock)
acquire(&kmem.lock);
r = (struct run*)v;
r->next = kmem.freelist;
kmem.freelist = r;
if(kmem.use_lock)
release(&kmem.lock);
}
The structure for the run
is defined below, which only contains the pointer of next run
.
struct run {
struct run *next;
};
The program flow of creating process address space is relatively simple. Without showing some processes of reading ELF
, the flow can be represented as the figure below:
The creation of process address space is done by the system call exec
. exec
would read ELF
's message and find the according section.
Then, exec
calls allouvm
to allocates memory for each ELF segment.
if((sz = allocuvm(pgdir, sz, ph.vaddr + ph.memsz)) == 0)
// Allocate page tables and physical memory to grow process from oldsz to
// newsz, which need not be page aligned. Returns new size or 0 on error.
int
allocuvm(pde_t *pgdir, uint oldsz, uint newsz)
{
char *mem;
uint a;
if(newsz >= KERNBASE)
return 0;
if(newsz < oldsz)
return oldsz;
a = PGROUNDUP(oldsz);
for(; a < newsz; a += PGSIZE){
mem = kalloc();
if(mem == 0){
cprintf("allocuvm out of memory\n");
deallocuvm(pgdir, newsz, oldsz);
return 0;
}
memset(mem, 0, PGSIZE);
if(mappages(pgdir, (char*)a, PGSIZE, V2P(mem), PTE_W|PTE_U) < 0){
cprintf("allocuvm out of memory (2)\n");
deallocuvm(pgdir, newsz, oldsz);
kfree(mem);
return 0;
}
}
return newsz;
}
Finally, exec
calls loaduvm
to loads each segment into memory. In loaduvm
, it also uses walkpgdir
to find the physical address.
if(loaduvm(pgdir, (char*)ph.vaddr, ip, ph.off, ph.filesz) < 0)
// Load a program segment into pgdir. addr must be page-aligned
// and the pages from addr to addr+sz must already be mapped.
int
loaduvm(pde_t *pgdir, char *addr, struct inode *ip, uint offset, uint sz)
{
uint i, pa, n;
pte_t *pte;
if((uint) addr % PGSIZE != 0)
panic("loaduvm: addr must be page aligned");
for(i = 0; i < sz; i += PGSIZE){
if((pte = walkpgdir(pgdir, addr+i, 0)) == 0)
panic("loaduvm: address should exist");
pa = PTE_ADDR(*pte);
if(sz - i < PGSIZE)
n = sz - i;
else
n = PGSIZE;
if(readi(ip, P2V(pa), offset+i, n) != n)
return -1;
}
return 0;
}