#include <lwk/kernel.h>
#include <lwk/init.h>
#include <lwk/task.h>
#include <lwk/cpu.h>
#include <lwk/ptrace.h>
#include <lwk/string.h>
#include <lwk/delay.h>
#include <arch/processor.h>
#include <arch/desc.h>
#include <arch/proto.h>
#include <arch/i387.h>
#include <arch/apic.h>
#include <arch/tsc.h>

/**
 * Bitmap of CPUs that have been initialized.
 * cpu_init() sets the calling CPU's bit with cpu_test_and_set() and panics
 * if the bit was already set, so a CPU is only ever initialized once.
 */
static cpumask_t cpu_initialized_map;

/**
 * Memory for STACKFAULT stacks, one for each CPU.
 * Each CPU owns one PAGE_SIZE-byte row of this array. tss_init() records the
 * calling CPU's row in the STACKFAULT_STACK slot of that CPU's Interrupt
 * Stack Table. The array is placed in the .bss.page_aligned section.
 */
char stackfault_stack[NR_CPUS][PAGE_SIZE]
__attribute__((section(".bss.page_aligned")));

/**
 * Memory for DOUBLEFAULT stacks, one for each CPU.
 * Same layout as above; recorded in the DOUBLEFAULT_STACK slot by tss_init().
 */
char doublefault_stack[NR_CPUS][PAGE_SIZE]
__attribute__((section(".bss.page_aligned")));

/**
 * Memory for NMI stacks, one for each CPU.
 * Same layout as above; recorded in the NMI_STACK slot by tss_init().
 */
char nmi_stack[NR_CPUS][PAGE_SIZE]
__attribute__((section(".bss.page_aligned")));

/**
 * Memory for DEBUG stacks, one for each CPU.
 * Same layout as above; recorded in the DEBUG_STACK slot by tss_init().
 */
char debug_stack[NR_CPUS][PAGE_SIZE]
__attribute__((section(".bss.page_aligned")));

/**
 * Memory for MCE stacks, one for each CPU.
 * Same layout as above; recorded in the MCE_STACK slot by tss_init().
 */
char mce_stack[NR_CPUS][PAGE_SIZE]
__attribute__((section(".bss.page_aligned")));

/**
 * Initializes the calling CPU's Per-CPU Data Area (PDA).
 * When in kernel mode, each CPU's GS.base is loaded with the address of the
 * CPU's PDA. This allows data in the PDA to be accessed using segment relative
 * accesses, like:
 *
 *     movq %gs:pcurrent,%rdi	// move CPU's current task pointer to rdi
 *
 * This is similar to thread-local data for user-level programs.
 *
 * Arguments:
 *     cpu   ID stored in the PDA's cpunumber field and used to look up the
 *           PDA via cpu_pda(). GS_BASE is written on the executing CPU, so
 *           this is expected to be the calling CPU's own ID.
 *     task  Task recorded as the CPU's current task; it also supplies the
 *           active address space and the kernel stack address.
 */
void __init
pda_init(unsigned int cpu, struct task_struct *task)
{
	struct x8664_pda *pda = cpu_pda(cpu);

	/*
	 * Point FS and GS at the NULL segment descriptor (entry 0) in the GDT.
	 * x86_64 does away with a lot of segmentation cruftiness... there's no
	 * need to set up specific GDT entries for FS or GS.
	 *
	 * NOTE(review): this is done before GS_BASE is written below,
	 * presumably because reloading %gs can reset GS.base -- confirm
	 * before reordering.
	 */
	asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));

	/*
	 * Load the address of this CPU's PDA into this CPU's GS_BASE model
	 * specific register. MSR_GS_BASE is the base address of the GS
	 * segment currently in use (GS.base), so from this point on
	 * %gs-relative accesses reach this CPU's PDA.
	 *
	 * The SWAPGS instruction exchanges GS.base with MSR_KERNEL_GS_BASE.
	 * It is meant to be issued when the kernel is exited, to switch to
	 * the user-level GS.base, and again upon entry to the kernel, to
	 * switch back to the PDA address.
	 *
	 * NOTE(review): the mb() calls presumably keep memory accesses from
	 * being reordered across the GS.base change -- confirm.
	 */
	mb();
	wrmsrl(MSR_GS_BASE, pda);
	mb();

	/*
	 * Fill in the PDA fields.
	 * NOTE(review): kernelstack assumes the task structure sits at the
	 * lowest address of a TASK_SIZE-byte region that also holds the
	 * task's kernel stack -- confirm against the task allocator.
	 */
	pda->cpunumber     = cpu;
	pda->pcurrent      = task;
	pda->active_aspace = task->aspace;
	pda->kernelstack   = (unsigned long)task - PDA_STACKOFFSET + TASK_SIZE;
	pda->mmu_state     = 0;
}

/**
 * Adjusts the calling CPU's Control Register 4 (CR4).
 * The bootstrap assembly code has already programmed part of this register,
 * so only the feature bits named below are cleared; every other bit keeps
 * whatever value it had on entry.
 */
static void __init
cr4_init(void)
{
	/* CR4 features that are switched off on every CPU */
	const unsigned long unwanted_bits =
		  X86_CR4_VME	/* Virtual-8086 mode support/cruft */
		| X86_CR4_PVI	/* support for the VIF flag */
		| X86_CR4_TSD	/* cleared so RDTSC is allowed at user-level */
		| X86_CR4_DE;	/* debugging extensions */

	clear_in_cr4(unwanted_bits);
}

/**
 * Sets up and activates the Global Descriptor Table (GDT) of the calling CPU.
 * Every CPU uses a private GDT; this routine also points the CPU's Local
 * Descriptor Table register at the NULL descriptor.
 */
static void __init
gdt_init(void)
{
	const unsigned int id = this_cpu;

	/*
	 * CPU 0 (the bootstrap CPU) already has its GDT filled in.
	 * Any other CPU gets its table by copying cpu_gdt_table.
	 */
	if (id != 0) {
		memcpy(cpu_gdt(id), cpu_gdt_table, GDT_SIZE);
	}
	cpu_gdt_descr[id].size = GDT_SIZE;

	/* Hand the descriptor for this CPU's table to the processor */
	asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[id]));

	/*
	 * No LDT is needed, so load the LDT register with selector 0,
	 * i.e. the NULL descriptor.
	 */
	asm volatile("lldt %w0":: "r" (0));
}

/**
 * Installs the calling CPU's Interrupt Descriptor Table (IDT).
 * All CPUs share the same IDT, which is described by idt_descr.
 */
static void __init
idt_init(void)
{
	/*
	 * The bootstrap CPU has already filled in the IDT table via the
	 * interrupts_init() call in setup.c. All we need to do is tell the CPU
	 * about it by loading idt_descr with the LIDT instruction.
	 */
	asm volatile("lidt %0" :: "m" (idt_descr));
}

/**
 * Initializes and installs the calling CPU's Task State Segment (TSS).
 *
 * Three things happen here, in order:
 *   1. The Interrupt Stack Table (IST) slots are pointed at this CPU's
 *      dedicated exception stacks.
 *   2. Every bit of the I/O permission bitmap is set (a set bit denies
 *      user-level access to the corresponding I/O port).
 *   3. The TSS descriptor is installed and the Task Register is loaded.
 */
static void __init
tss_init(void)
{
	unsigned int       cpu  = this_cpu;
	struct tss_struct  *tss = &per_cpu(tss, cpu);
	int i;

	/*
	 * Initialize the CPU's Interrupt Stack Table.
	 * Certain exceptions and interrupts are handled with their own,
	 * known good stack. The IST holds the address of these stacks.
	 *
	 * The processor loads the IST value into RSP and then pushes the
	 * exception frame, and x86 stacks grow towards lower addresses.
	 * Each entry must therefore be the address just past the END of the
	 * CPU's stack, not its first byte; otherwise the frame would be
	 * written below the stack, into whatever memory precedes it.
	 */
	tss->ist[STACKFAULT_STACK-1]  = (unsigned long)stackfault_stack[cpu]
	                                + sizeof(stackfault_stack[cpu]);
	tss->ist[DOUBLEFAULT_STACK-1] = (unsigned long)doublefault_stack[cpu]
	                                + sizeof(doublefault_stack[cpu]);
	tss->ist[NMI_STACK-1]         = (unsigned long)nmi_stack[cpu]
	                                + sizeof(nmi_stack[cpu]);
	tss->ist[DEBUG_STACK-1]       = (unsigned long)debug_stack[cpu]
	                                + sizeof(debug_stack[cpu]);
	tss->ist[MCE_STACK-1]         = (unsigned long)mce_stack[cpu]
	                                + sizeof(mce_stack[cpu]);

	/*
	 * Initialize the CPU's I/O permission bitmap.
	 * The <= is required because the CPU will access up to 8 bits beyond
	 * the end of the IO permission bitmap.
	 * NOTE(review): this relies on io_bitmap[] being declared with
	 * IO_BITMAP_LONGS + 1 elements -- confirm in struct tss_struct.
	 */
	tss->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
	for (i = 0; i <= IO_BITMAP_LONGS; i++)
		tss->io_bitmap[i] = ~0UL;

	/*
	 * Install the CPU's TSS and load the CPU's Task Register (TR).
	 * Each CPU has its own TSS. The selector is the TSS entry's index in
	 * the GDT multiplied by 8.
	 */
	set_tss_desc(cpu, tss);
	asm volatile("ltr %w0":: "r" (GDT_ENTRY_TSS*8));
}

/**
 * Programs the Model Specific Registers (MSRs) the calling CPU needs for
 * system calls and for the per-CPU data area (PDA).
 */
static void __init
msr_init(void)
{
	/*
	 * STAR carries the segment selectors for both directions of a system
	 * call: the upper field is used for CS and SS on SYSRET, the lower
	 * field for CS and SS on SYSCALL.
	 */
	const u64 star_value = ((u64)__USER32_CS << 48) |  /* SYSRET  CS+SS */
	                       ((u64)__KERNEL_CS << 32);   /* SYSCALL CS+SS */

	/*
	 * RFLAGS bits cleared by the CPU when SYSCALL is issued.
	 * 0x3000 covers RFLAGS bits 12-13, the IOPL field.
	 */
	const u64 rflags_mask = EF_TF | EF_DF | EF_IE | 0x3000;

	/*
	 * SYSCALL / SYSRET support. The manual is the authoritative source,
	 * but in short:
	 *
	 *   - On SYSCALL the CPU takes CS and SS from STAR and RIP from
	 *     LSTAR; the previous RIP is saved in RCX.
	 *   - On SYSRET the CPU takes CS and SS from STAR and RIP from RCX.
	 *   - SYSCALL_MASK selects the RFLAGS bits to clear on SYSCALL.
	 */
	wrmsrl(MSR_STAR, star_value);
	wrmsrl(MSR_LSTAR, asm_syscall);            /* SYSCALL RIP */
	wrmsrl(MSR_CSTAR, asm_syscall_ignore);     /* RIP for compat. mode */
	wrmsrl(MSR_SYSCALL_MASK, rflags_mask);

	/*
	 * PDA support. MSR_GS_BASE was already set by pda_init(). SWAPGS
	 * atomically exchanges MSR_GS_BASE with MSR_KERNEL_GS_BASE, so the
	 * SWAPGS issued when leaving the kernel brings in the value written
	 * to MSR_KERNEL_GS_BASE here. User-space therefore sees both
	 * MSR_FS_BASE and MSR_GS_BASE as 0.
	 */
	wrmsrl(MSR_FS_BASE, 0);
	wrmsrl(MSR_KERNEL_GS_BASE, 0);
}

/**
 * Initializes the calling CPU's debug registers.
 * Writes zero to DR0-DR3, DR6 and DR7, in that order.
 */
static void __init
dbg_init(void)
{
	/*
	 * Clear the CPU's debug registers.
	 * DR[0-3] are Address-Breakpoint Registers
	 * DR[4-5] are reserved and should not be used by software
	 * DR6 is the Debug Status Register
	 * DR7 is the Debug Control Register
	 *
	 * NOTE(review): the register number is a literal in every call;
	 * presumably set_debugreg() builds the instruction from it, so check
	 * its definition before turning this into a loop.
	 */
	set_debugreg(0UL, 0);
	set_debugreg(0UL, 1);
	set_debugreg(0UL, 2);
	set_debugreg(0UL, 3);
	set_debugreg(0UL, 6);
	set_debugreg(0UL, 7);
}

/**
 * Performs the architecture-level initialization of the calling CPU.
 * Panics if the CPU has already been through this routine; otherwise marks
 * it as initialized and runs each per-CPU setup step in a fixed order.
 */
void __init
cpu_init(void)
{
	struct task_struct *self;
	unsigned int        id;

	/*
	 * The usual 'current' lookup depends on the PDA, which has not been
	 * set up yet on any CPU except the boot CPU (id=0); pda_init() only
	 * runs further down. So the running task is located through the
	 * stack pointer instead, and the logical CPU ID is read from it.
	 */
	self = get_current_via_RSP();
	id   = self->cpu_id;

	if (cpu_test_and_set(id, cpu_initialized_map)) {
		panic("CPU#%u already initialized!\n", id);
	}
	printk(KERN_DEBUG "Initializing CPU#%u\n", id);

	/* Per-CPU data area */
	pda_init(id, self);

	/* CPU feature detection via CPUID */
	identify_cpu();

	/* Control register 4 */
	cr4_init();

	/* Descriptor tables: global, then interrupt */
	gdt_init();
	idt_init();

	/* Task state segment */
	tss_init();

	/* Miscellaneous model specific registers */
	msr_init();

	/* Debug registers */
	dbg_init();

	/* Floating point unit */
	fpu_init();

	/* Local advanced programmable interrupt controller */
	lapic_init();

	/* CPU frequency detection, udelay(), etc. */
	time_init();

	/* Compiler memory barrier, avoids reordering */
	barrier();
}