diff --git a/arch/Kconfig b/arch/Kconfig index 4218c4500b3..a247ddc32c8 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -32,6 +32,10 @@ config X86 select ATOMIC_OPERATIONS_BUILTIN select HAS_DTS +config X86_64 + bool "x86_64 architecture" + select ATOMIC_OPERATIONS_BUILTIN + config NIOS2 bool "Nios II Gen 2 architecture" select ATOMIC_OPERATIONS_C diff --git a/arch/x86_64/CMakeLists.txt b/arch/x86_64/CMakeLists.txt new file mode 100644 index 00000000000..7975866fdeb --- /dev/null +++ b/arch/x86_64/CMakeLists.txt @@ -0,0 +1,12 @@ +set(X86_64_BASE_CFLAGS + -ffreestanding + -fno-pic + -fno-asynchronous-unwind-tables + -mno-sse + -mno-red-zone) + +add_subdirectory(core) + +zephyr_compile_options(${X86_64_BASE_CFLAGS} -mx32) + +zephyr_link_libraries(-mx32) diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig new file mode 100644 index 00000000000..d8c7441ffef --- /dev/null +++ b/arch/x86_64/Kconfig @@ -0,0 +1,31 @@ +config ARCH + default "x86_64" + +config XUK_DEBUG + bool "Debug logging at lowest level" + default n + help + When true, enables debug logging from the XUK layer in very + early boot situations (including the 16 and 32 bit stub + code) on the first serial port (115200 8n1) and VGA text + console. Also wires that output stream to the printk() + function so it can be used before any console drivers are + initialized. + +config XUK_APIC_TSC_SHIFT + int "Power-of-two divisor between TSC and APIC timer" + default 6 + help + Configures the precision of the APIC timer as a bit shift of + the TSC frequency. High values "slow down" the tick rate of + the APIC timer and allow for longer timeouts at the expense + of precision. + +config IRQ_OFFLOAD_VECTOR + int "Interrupt vector for irq_offload" + default 255 + help + This is the interrupt vector to use for the self-directed + IPIs used to implement irq_offload(). Most apps will never + change this. It's configurable in case someone wants to + play with its priority. diff --git a/arch/x86_64/core/CMakeLists.txt b/arch/x86_64/core/CMakeLists.txt new file mode 100644 index 00000000000..93ce5cbf2ed --- /dev/null +++ b/arch/x86_64/core/CMakeLists.txt @@ -0,0 +1,71 @@ +zephyr_library() + +zephyr_library_sources( + x86_64.c + xuk.c + xuk-stubs-copy.c # <-- generated, see below +) + +set(incdir ${PROJECT_BINARY_DIR}/include/generated) + +# We want to include two non-x86_64 stubs as sections/symbols in our +# link (one 16 bit code for SMP real mode bootstraping, the other a 32 +# bit hook for OS protected mode entry). This is tedious to do with +# the linker directly, so the mechanism picked here is to have a C +# file (which really is all assembly) import them with ".incbin" +# statements. But I can't figure out how to add a dependency to a C +# file directly, so we copy the file so it can live as a separate +# dependency node we control. 
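# (For reference, the C-side import is just an assembler directive; the
# xuk-stubs.c file further down in this patch does, for each stub,
#     __asm__(".incbin \"xuk-stub16.bin\"");
# so making the copied file DEPEND on the generated .bin outputs gives
# cmake the dependency edge it needs.)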
+# +add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/xuk-stubs-copy.c + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_SOURCE_DIR}/xuk-stubs.c + ${CMAKE_CURRENT_BINARY_DIR}/xuk-stubs-copy.c + DEPENDS ${incdir}/xuk-stub16.bin + ${incdir}/xuk-stub32.bin +) + +add_custom_command( + OUTPUT ${incdir}/xuk-stub16.bin + COMMAND ${CMAKE_C_COMPILER} -m16 -Os ${X86_64_BASE_CFLAGS} -imacros ${AUTOCONF_H} + -c ${CMAKE_CURRENT_SOURCE_DIR}/xuk-stub16.c + -o ${CMAKE_CURRENT_BINARY_DIR}/xuk-stub16.o + COMMAND ${CMAKE_OBJCOPY} -O binary -j .text + ${CMAKE_CURRENT_BINARY_DIR}/xuk-stub16.o + ${incdir}/xuk-stub16.bin +) + +add_custom_command( + OUTPUT ${incdir}/xuk-stub32.bin + COMMAND ${CMAKE_C_COMPILER} -m32 -Os ${X86_64_BASE_CFLAGS} -imacros ${AUTOCONF_H} + -c ${CMAKE_CURRENT_SOURCE_DIR}/xuk-stub32.c + -o ${CMAKE_CURRENT_BINARY_DIR}/xuk-stub32.o + COMMAND ${CMAKE_C_COMPILER} -m32 ${X86_64_BASE_CFLAGS} + -Wl,--build-id=none -nostdlib -nodefaultlibs -nostartfiles + -T ${CMAKE_CURRENT_SOURCE_DIR}/xuk-stub32.ld + ${CMAKE_CURRENT_BINARY_DIR}/xuk-stub32.o + -o ${CMAKE_CURRENT_BINARY_DIR}/xuk-stub32.elf + COMMAND ${CMAKE_OBJCOPY} -O binary + ${CMAKE_CURRENT_BINARY_DIR}/xuk-stub32.elf + ${incdir}/xuk-stub32.bin +) + +# The zephyr.elf file generated for an x86_64 binary is a 64 bit +# binary, but Qemu requires a traditional i386 file (because the entry +# point from multiboot is in 386 protected mode). Do a relink dance +# with objcopy to convert. Note use of the same .incbin trick with +# copy, per above. +# +set(qkernel_file ${CMAKE_BINARY_DIR}/zephyr-qemu.elf) +add_custom_target(qemu_kernel_target DEPENDS ${qkernel_file}) +add_custom_command( + OUTPUT ${qkernel_file} + DEPENDS zephyr_prebuilt + COMMAND ${CMAKE_OBJCOPY} -O binary ${CMAKE_BINARY_DIR}/zephyr/zephyr.elf ${CMAKE_CURRENT_BINARY_DIR}/zephyr-qemu.bin + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/qemuinc.c ${CMAKE_CURRENT_BINARY_DIR} + COMMAND ${CMAKE_C_COMPILER} -m32 -c ${CMAKE_CURRENT_BINARY_DIR}/qemuinc.c -o ${CMAKE_CURRENT_BINARY_DIR}/zephyr-qemu.o + COMMAND ${CMAKE_C_COMPILER} -m32 -T ${CMAKE_CURRENT_SOURCE_DIR}/xuk64.ld + -Wl,--build-id=none -nostdlib -nodefaultlibs -nostartfiles + -o ${qkernel_file} ${CMAKE_CURRENT_BINARY_DIR}/zephyr-qemu.o + ) diff --git a/arch/x86_64/core/Makefile.xuk b/arch/x86_64/core/Makefile.xuk new file mode 100644 index 00000000000..a9b393f6dec --- /dev/null +++ b/arch/x86_64/core/Makefile.xuk @@ -0,0 +1,60 @@ +# Any linux host toolchain should work as a default +CC ?= gcc +OBJCOPY ?= objcopy +QEMU ?= qemu-system-x86_64 + +# No unwind tables is just to save size. No SSE is allowed because GCC +# uses it for miscellaneous optimizations that aren't related to +# floating point, and we don't want to take the traps except on +# threads that definitely need it. No red zone because it's +# incompatible with traditional stack-based interrupt entry. +CFLAGS = -Os -I../include -std=c11 -ffreestanding -fno-pic -fno-asynchronous-unwind-tables -mno-sse -mno-red-zone + +LDFLAGS = -Wl,--build-id=none -nostdlib -nodefaultlibs -nostartfiles + +# This works great. But note that distros ship no libgcc for the +# target, so once we start to need stuff from that we'll need to move +# to a custom cross compiler. +ARCHFLAGS = -mx32 + +# The default build target just links the stub files. Broader OS +# builds just care about these files. The xuk.elf target is a +# demonstration kernel. +stubs: xuk-stub32.bin xuk-stub16.bin + +# First link the initial 32 bit stub, which goes at the front of our +# image. 
+xuk-stub32.bin: xuk-stub32.c *.h xuk-stub32.ld + $(CC) -Wall -m32 $(CFLAGS) -c xuk-stub32.c + $(CC) -m32 -T xuk-stub32.ld $(LDFLAGS) -o stub32.elf $(CFLAGS) xuk-stub32.o + $(OBJCOPY) -O binary stub32.elf $@ + +# This is the main OS image, starting with the 32 bit stub and +# containing all the 64 bit code. +xuk.elf64: xuk-stub32.bin xuk-stub16.bin xuk.c xuk-stubs.c demo-kernel.c *.h xuk64.ld + $(CC) $(ARCHFLAGS) -Wall $(CFLAGS) -c xuk.c + $(CC) $(ARCHFLAGS) -Wall $(CFLAGS) -c xuk-stubs.c + $(CC) $(ARCHFLAGS) -Wall $(CFLAGS) -c demo-kernel.c + $(CC) $(ARCHFLAGS) -T xuk64.ld $(LDFLAGS) -o $@ $(CFLAGS) xuk.o xuk-stubs.o demo-kernel.o + +# Final step. We now have an x86_64 ELF binary, which is not a valid +# multiboot image as the entry point is of course 32 bit. It needs to +# be a i386 image, so copy out the segment and relink the blob one +# last time. +xuk.elf: xuk.elf64 xuk64.ld + $(OBJCOPY) -O binary $< xuk.bin + echo '.incbin "xuk.bin"' | as --32 -c - -o xuk32.o + $(CC) -m32 -T xuk64.ld $(LDFLAGS) -o $@ $(CFLAGS) xuk32.o + +# We can rely on the bootloader to handover a machine running in 386 +# protected mode, but SMP cores start in real mode and need a tiny +# bootstrap layer of 16 bit code. +xuk-stub16.bin: xuk-stub16.c + $(CC) -m16 $(CFLAGS) -c $< + $(OBJCOPY) -O binary -j .text xuk-stub16.o $@ + +run: xuk.elf + $(QEMU) -serial mon:stdio -smp cpus=2 -icount shift=1 -no-reboot -no-shutdown -d unimp,pcall,guest_errors -kernel $< + +clean: + rm -f *.elf *.elf64 *.o *~ *.bin *.disasm diff --git a/arch/x86_64/core/demo-kernel.c b/arch/x86_64/core/demo-kernel.c new file mode 100644 index 00000000000..7aaed2f2444 --- /dev/null +++ b/arch/x86_64/core/demo-kernel.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#include "serial.h" +#include "vgacon.h" +#include "printf.h" +#include "xuk.h" + +/* Tiny demonstration of the core64 code. Implements enough of an + * "OS" layer to do some simple unit testing. + */ + +static void putchar(int c) +{ + serial_putc(c); + vgacon_putc(c); +} + +void test_timers(void) +{ + /* Quickly calibrate the timers against each other. Note that + * the APIC is counting DOWN instead of up! Seems like on + * qemu, the APIC base frequency is 3.7x slower than the tsc. + * Looking at source, it seems like APIC is uniformly shifted + * down from a nominal 1Ghz reference + * (i.e. qemu_get_time_ns()), where the TSC is based on + * cpu_get_ticks() and thus pulls in wall clock time & such. + * If you specify "-icount shift=1", then they synchronize + * properly. + */ + int tsc0, apic0, tsc1, apic1; + + __asm__ volatile("rdtsc" : "=a"(tsc0) : : "rdx"); + apic0 = _apic.CURR_COUNT; + do { + /* Qemu misbehaves if I spam these registers. 
*/ + for (int i = 0; i < 1000; i++) { + __asm__ volatile("nop"); + } + + __asm__ volatile("rdtsc" : "=a"(tsc1) : : "rdx"); + apic1 = _apic.CURR_COUNT; + } while ((tsc1 - tsc0) < 10000 || (apic0 - apic1) < 10000); + printf("tsc %d apic %d\n", tsc1 - tsc0, apic0 - apic1); +} + +unsigned int _init_cpu_stack(int cpu) +{ + return (long)alloc_page(0) + 4096; +} + +void handler_timer(void *arg, int err) +{ + printf("Timer expired on CPU%d\n", (int)(long)xuk_get_f_ptr()); +} + +void handler_f3(void *arg, int err) +{ + printf("f3 handler on cpu%d arg %x, triggering INT 0xff\n", + (int)(long)xuk_get_f_ptr(), (int)(long)arg); + __asm__ volatile("int $0xff"); + printf("end f3 handler\n"); +} + +void _unhandled_vector(int vector, int err, struct xuk_entry_frame *f) +{ + (void)f; + _putchar = putchar; + printf("Unhandled vector %d (err %xh) on CPU%d\n", + vector, err, (int)(long)xuk_get_f_ptr()); +} + +void _isr_entry(void) +{ +} + +void *_isr_exit_restore_stack(void *interrupted) +{ + /* Somewhat hacky test of the ISR exit modes. Two ways of + * specifying "this stack", one of which does the full spill + * and restore and one shortcuts that due to the NULL + * return + */ + if (rdtsc() & 1) { + return interrupted; + } else { + return 0; + } +} + +void *switch_back_to; + +void switch_back(int arg1, int arg2, int arg3) +{ + printf("Switching back (%d, %d, %d) sbt %xh\n", + arg1, arg2, arg3, (int)(long)switch_back_to); + xuk_switch(switch_back_to, &switch_back_to); +} + +void test_switch(void) +{ + static unsigned long long stack[256]; + long args[] = { 5, 4, 3 }; + int eflags = 0x20; /* interrupts disabled */ + + long handle = xuk_setup_stack((long)(sizeof(stack) + (char *)stack), + switch_back, eflags, args, 3); + + printf("Switching to %xh (stack %xh)\n", + (int)handle, (int)(long)&stack[0]); + __asm__ volatile("cli"); + xuk_switch((void *)handle, &switch_back_to); + __asm__ volatile("sti"); + printf("Back from switch\n"); +} + +void local_ipi_handler(void *arg, int err) +{ + printf("local IPI handler on CPU%d\n", (int)(long)xuk_get_f_ptr()); +} + +/* Sends an IPI to the current CPU and validates it ran */ +void test_local_ipi(void) +{ + printf("Testing a local IPI on CPU%d\n", (int)(long)xuk_get_f_ptr()); + + _apic.ICR_HI = (struct apic_icr_hi) {}; + _apic.ICR_LO = (struct apic_icr_lo) { + .delivery_mode = FIXED, + .vector = 0x90, + .shorthand = SELF, + }; +} + +void _cpu_start(int cpu) +{ + _putchar = putchar; + printf("Entering demo kernel\n"); + + /* Make sure the FS/GS pointers work, then set F to store our + * CPU ID + */ + xuk_set_f_ptr(cpu, (void *)(long)(0x19283700 + cpu)); + xuk_set_g_ptr(cpu, (void *)(long)(0xabacad00 + cpu)); + printf("fptr %p gptr %p\n", xuk_get_f_ptr(), xuk_get_g_ptr()); + + xuk_set_f_ptr(cpu, (void *)(long)cpu); + + /* Set up this CPU's timer */ + /* FIXME: this sets up a separate vector for every CPU's + * timer, and we'll run out. They should share the vector but + * still have individually-set APIC config. 
Probably wants a + * "timer" API + */ + xuk_set_isr(INT_APIC_LVT_TIMER, 10, handler_timer, 0); + _apic.INIT_COUNT = 5000000; + test_timers(); + + if (cpu == 0) { + xuk_set_isr(0x1f3, 0, (void *)handler_f3, (void *)0x12345678); + } + + __asm__ volatile("int $0xf3"); + + /* Fire it all up */ + printf("Enabling Interrupts\n"); + __asm__ volatile("sti"); + printf("Interrupts are unmasked (eflags %xh), here we go...\n", + eflags()); + + /* Wait a teeny bit then send an IPI to CPU0, which will hit + * the unhandled_vector handler + */ + if (cpu == 1) { + int t0 = rdtsc(); + + while (rdtsc() - t0 < 1000000) { + } + + _apic.ICR_HI = (struct apic_icr_hi) { + .destination = 0 + }; + _apic.ICR_LO = (struct apic_icr_lo) { + .delivery_mode = FIXED, + .vector = 66, + }; + while (_apic.ICR_LO.send_pending) { + } + } + + test_switch(); + + xuk_set_isr(XUK_INT_RAW_VECTOR(0x90), -1, local_ipi_handler, 0); + test_local_ipi(); + + printf("CPU%d initialized, sleeping\n", cpu); + while (1) { + __asm__ volatile("hlt"); + } +} diff --git a/arch/x86_64/core/offsets/offsets.c b/arch/x86_64/core/offsets/offsets.c new file mode 100644 index 00000000000..d5921dc50c1 --- /dev/null +++ b/arch/x86_64/core/offsets/offsets.c @@ -0,0 +1,5 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ diff --git a/arch/x86_64/core/printf.h b/arch/x86_64/core/printf.h new file mode 100644 index 00000000000..0d7fe02bb53 --- /dev/null +++ b/arch/x86_64/core/printf.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#include + +/* Tiny, but not-as-primitive-as-it-looks implementation of something + * like s/n/printf(). Handles %d, %x, %c and %s only, no precision + * specifiers or type modifiers. + */ + +struct _pfr { + char *buf; + int len; + int idx; +}; + +/* Set this function pointer to something that generates output */ +static void (*_putchar)(int c); + +static void pc(struct _pfr *r, int c) +{ + if (r->buf) { + if (r->idx <= r->len) + r->buf[r->idx] = c; + } else { + _putchar(c); + } + r->idx++; +} + +static void prdec(struct _pfr *r, int v) +{ + if (v < 0) { + pc(r, '-'); + v = -v; + } + + char digs[11]; + int i = 10; + + digs[i--] = 0; + while (v || i == 9) { + digs[i--] = '0' + (v % 10); + v /= 10; + } + + while (digs[++i]) + pc(r, digs[i]); +} + +static void endrec(struct _pfr *r) +{ + if (r->buf && r->idx < r->len) + r->buf[r->idx] = 0; +} + +static int _vpf(struct _pfr *r, const char *f, va_list ap) +{ + for (/**/; *f; f++) { + if (*f != '%') { + pc(r, *f); + continue; + } + + switch (*(++f)) { + case '%': + pc(r, '%'); + break; + case 'c': + pc(r, va_arg(ap, int)); + break; + case 's': { + char *s = va_arg(ap, char *); + + while (*s) + pc(r, *s++); + break; + } + case 'p': + pc(r, '0'); + pc(r, 'x'); /* fall through... */ + case 'x': { + int sig = 0; + unsigned int v = va_arg(ap, unsigned int); + + for (int i = 7; i >= 0; i--) { + int d = (v >> (i*4)) & 0xf; + + sig += !!d; + if (sig || i == 0) + pc(r, "0123456789abcdef"[d]); + } + break; + } + case 'd': + prdec(r, va_arg(ap, int)); + break; + default: + pc(r, '%'); + pc(r, *f); + } + } + endrec(r); + return r->idx; +} + +#define CALL_VPF(rec) \ + va_list ap; \ + va_start(ap, f); \ + int ret = _vpf(&r, f, ap); \ + va_end(ap); \ + return ret + +static inline int snprintf(char *buf, unsigned long len, const char *f, ...) +{ + struct _pfr r = { .buf = buf, .len = len }; + + CALL_VPF(&r); +} + +static inline int sprintf(char *buf, const char *f, ...) 
+{ + struct _pfr r = { .buf = buf, .len = 0x7fffffff }; + + CALL_VPF(&r); +} + +static inline int printf(const char *f, ...) +{ + struct _pfr r = {0}; + + CALL_VPF(&r); +} diff --git a/arch/x86_64/core/qemuinc.c b/arch/x86_64/core/qemuinc.c new file mode 100644 index 00000000000..4dbccede4ce --- /dev/null +++ b/arch/x86_64/core/qemuinc.c @@ -0,0 +1,11 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +/* This file exists solely to include a single binary blob in a link, + * used by the qemu kernel file architecture swap code in the cmake + * configuration. + */ + +__asm__(".incbin \"zephyr-qemu.bin\""); diff --git a/arch/x86_64/core/serial.h b/arch/x86_64/core/serial.h new file mode 100644 index 00000000000..fcc682c5a9c --- /dev/null +++ b/arch/x86_64/core/serial.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#include "x86_64-hw.h" + +/* Super-primitive 8250 serial output-only driver, 115200 8n1 */ + +#define _PORT 0x3f8 + +static inline void _serout(int c) +{ + while (!(ioport_in8(_PORT + 5) & 0x20)) { + } + ioport_out8(_PORT, c); +} + +static inline void serial_putc(int c) +{ + if (c == '\n') { + _serout('\r'); + } + _serout(c); +} + +static inline void serial_puts(const char *s) +{ + while (*s) { + serial_putc(*s++); + } +} + +static inline void serial_init(void) +{ + /* In fact Qemu already has most of this set up and works by + * default + */ + ioport_out8(_PORT+1, 0); /* IER = 0 */ + ioport_out8(_PORT+3, 0x80); /* LCR = 8n1 + DLAB select */ + ioport_out8(_PORT, 1); /* Divisor Latch low byte */ + ioport_out8(_PORT+1, 0); /* Divisor Latch high byte */ + ioport_out8(_PORT+3, 0x03); /* LCR = 8n1 + DLAB off */ + ioport_out8(_PORT+4, 0x03); /* MCR = DTR & RTS asserted */ +} diff --git a/arch/x86_64/core/shared-page.h b/arch/x86_64/core/shared-page.h new file mode 100644 index 00000000000..253ab442b7b --- /dev/null +++ b/arch/x86_64/core/shared-page.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef _SHARED_PAGE_H +#define _SHARED_PAGE_H + +/* Defines a simple interface for sharing a single page of data across + * CPU modes and SMP cores where it can be easily found and relied + * upon. + */ + +#include "xuk-config.h" +#include "x86_64-hw.h" + +/* The shared block lives in the 5th page of memory, immediately after + * the 16k null guard region + */ +#define SHARED_ADDR 0x4000 + +/* Magic cookies passed to stub32 to tell it what's going on */ +#define BOOT_MAGIC_MULTIBOOT 0x2badb002 /* initial handoff from bootloader */ +#define BOOT_MAGIC_STUB16 0xaaf08df7 /* AP cpu initialization */ + +struct xuk_shared_mem { + /* Stack to be used by SMP cpus at startup. MUST BE FIRST. */ + unsigned int smpinit_stack; + + /* Spinlock used to serialize SMP initialization */ + int smpinit_lock; + + /* Byte address of next page to allocate */ + unsigned int next_page; + + /* Top-level page table address */ + unsigned int base_cr3; + + /* 64 bit GDT */ + struct gdt64 gdt[3 + (2 * CONFIG_MP_NUM_CPUS)]; + + /* 64 bit IDT */ + unsigned int idt_addr; + + /* Precomputed GDT for the 16 bit stub */ + unsigned int gdt16_addr; + + /* Each pointer in these arrays is the base of the FS/GS + * segment for the indexed CPU. 
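 * (These slots are what the xuk_set_f_ptr()/xuk_get_f_ptr() and
 * xuk_set_g_ptr()/xuk_get_g_ptr() accessors presumably read and write,
 * since setup_fg_segs() in xuk.c points each CPU's FS/GS segment base
 * at the corresponding entry.)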
+ */ + long long fs_ptrs[CONFIG_MP_NUM_CPUS]; + long long gs_ptrs[CONFIG_MP_NUM_CPUS]; + + int num_active_cpus; + + /* Current output column in the VGA console */ + int vgacol; +}; + +#define _shared (*((struct xuk_shared_mem *)(long)SHARED_ADDR)) + +static inline void shared_init(void) +{ + for (int i = 0; i < sizeof(_shared)/sizeof(int); i++) { + ((int *)&_shared)[i] = 0; + } + + _shared.next_page = 0x5000; + _shared.vgacol = 80; +} + +static inline void *alloc_page(int clear) +{ + int *p = (int *)(long)_shared.next_page; + + _shared.next_page += 4096; + + for (int i = 0; clear && i < 1024; i++) { + p[i] = 0; + } + + return p; +} + +#endif /* _SHARED_PAGE_H */ diff --git a/arch/x86_64/core/vgacon.h b/arch/x86_64/core/vgacon.h new file mode 100644 index 00000000000..86b1c852aa2 --- /dev/null +++ b/arch/x86_64/core/vgacon.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#include "shared-page.h" + +/* Super-primitive VGA text console output-only "terminal" driver */ + +static inline unsigned short *_vga_row(int row) +{ + return ((unsigned short *)0xb8000) + 80 * row; +} + +/* Foreground color is four bit, high to low: "intensity", red, green, + * blue. Normal text is low intensity, so 0b0111 (7) is standard. + * The high nybble is the background color. + */ +static inline void vga_put(int ch, int color, int row, int col) +{ + unsigned short *rp = _vga_row(row); + + rp[col] = (color << 8) | ch; +} + +static inline void vgacon_putc(char c) +{ + if (_shared.vgacol == 80) { + for (int r = 0; r < 24; r++) { + for (int c = 0; c < 80; c++) { + _vga_row(r)[c] = _vga_row(r+1)[c]; + } + } + for (int c = 0; c < 80; c++) { + _vga_row(24)[c] = 0x9000; + } + _shared.vgacol = 0; + } + + if (c == '\n') { + _shared.vgacol = 80; + } else if (c == '\r') { + _shared.vgacol = 0; + } else { + vga_put(c, 0x1f, 24, _shared.vgacol++); + } +} diff --git a/arch/x86_64/core/x86_64-hw.h b/arch/x86_64/core/x86_64-hw.h new file mode 100644 index 00000000000..229cb9a0552 --- /dev/null +++ b/arch/x86_64/core/x86_64-hw.h @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef _X86_64_HW_H +#define _X86_64_HW_H + +/* + * Struct declarations and helper inlines for core x86_64 hardware + * functionality. Anything related to ioports, CR's MSR's, I/L/GDTs, + * PTEs or (IO-)APICs can be found here. Note that because this + * header is included in limited stub contexts, it should include + * declarations and inlines only: no data definitions, even extern + * ones! + */ + +static inline unsigned long eflags(void) +{ + int eflags; + + __asm__ volatile("pushfq; pop %%rax" : "=a"(eflags)); + return eflags; +} + +/* PAE page table record. Note that "addr" is aligned naturally as an + * address, but of course must be masked to change only significant + * bits (which depend on whether it's storing a 4k, 2M or 1G memory + * block) so as to not clobber the bitfields (remember "negative" + * addresses must mask off the top bits too!). The natural idiom is + * to assign addr first, then write the bitfields. 
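 *
 * A minimal sketch of that idiom (the address value is made up for
 * illustration; see init_page_tables() in xuk-stub32.c for real use):
 *
 *     struct pte64 e = {};
 *     e.addr = 0x200000;            <- 2M-aligned block address first
 *     e.present = 1;
 *     e.writable = 1;
 *     e.pagesize_pat = 1;           <- then the flag bits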
+ */ +struct pte64 { + union { + unsigned long long addr; + struct { + unsigned long long present : 1; + unsigned long long writable : 1; + unsigned long long usermode : 1; + unsigned long long writethrough : 1; + unsigned long long uncached : 1; + unsigned long long accessed : 1; + unsigned long long dirty : 1; + unsigned long long pagesize_pat : 1; + unsigned long long global : 1; + unsigned long long _UNUSED1 : 3; + unsigned long long pat : 1; + unsigned long long _UNUSED2 : 50; + unsigned long long exdisable : 1; + }; + }; +}; + +struct gdt64 { + union { + unsigned int dwords[2]; + struct { + unsigned long long limit_lo16 : 16; + unsigned long long base_lo16 : 16; + unsigned long long base_mid8 : 8; + unsigned long long accessed : 1; + unsigned long long readable : 1; + unsigned long long conforming : 1; + unsigned long long codeseg : 1; + unsigned long long notsystem : 1; + unsigned long long ring : 2; + unsigned long long present : 1; + unsigned long long limit_hi4 : 4; + unsigned long long available : 1; + unsigned long long long64 : 1; + unsigned long long default_size : 1; + unsigned long long page_granularity : 1; + unsigned long long base_hi8 : 8; + }; + }; +}; + +static inline void gdt64_set_base(struct gdt64 *g, unsigned int base) +{ + g->base_lo16 = base & 0xffff; + g->base_mid8 = (base >> 16) & 0xff; + g->base_hi8 = base >> 24; +} + +#define GDT_SELECTOR(seg) ((seg) << 3) + +struct idt64 { + unsigned short offset_lo16; + unsigned short segment; + unsigned int ist : 3; + unsigned int _UNUSED1 : 5; + unsigned int type : 4; + unsigned int _UNUSED2 : 1; + unsigned int ring : 2; + unsigned int present : 1; + unsigned short offset_mid16; + unsigned int offset_hi32; + unsigned int _UNUSED3; +}; + +static inline void idt64_set_isr(struct idt64 *desc, void *isr) +{ + unsigned long long addr = (unsigned long)isr; + + desc->offset_lo16 = addr & 0xffff; + desc->offset_mid16 = (addr >> 16) & 0xffff; + desc->offset_hi32 = addr >> 32; +} + +enum apic_delivery_mode { + FIXED = 0, LOWEST = 1, SMI = 2, NMI = 4, + INIT = 5, STARTUP = 6, EXTINT = 7, +}; + +struct apic_icr_lo { + unsigned int vector : 8; + enum apic_delivery_mode delivery_mode : 3; + unsigned int logical : 1; + unsigned int send_pending : 1; + unsigned int _unused : 1; + unsigned int assert : 1; + unsigned int level_trig : 1; + unsigned int _unused2 : 2; + enum { NONE, SELF, ALL, NOTSELF } shorthand : 2; +}; + +struct apic_icr_hi { + unsigned int _unused : 24; + unsigned int destination : 8; +}; + +/* Generic struct, not all field applicable to all LVT interrupts */ +struct apic_lvt { + unsigned int vector : 8; + enum apic_delivery_mode delivery_mode : 4; + unsigned int _UNUSED : 1; + unsigned int send_pending : 1; + unsigned int polarity : 1; + unsigned int remote_irr : 1; + unsigned int level_trig : 1; + unsigned int masked : 1; + enum { ONESHOT, PERIODIC, TSCDEADLINE } mode : 2; +}; + +/* Memory-mapped local APIC registers. Note that the registers are + * always the first dword in a 16 byte block, the other 3 being + * unused. So each line represents one of these registers, or an + * array thereof. Lots of (_u)nused fields in the layout, but the usage + * becomes pleasingly clean. 
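 *
 * For example, code elsewhere in this patch can treat the registers as
 * plain struct fields:
 *
 *     _apic.EOI = 0;
 *     _apic.INIT_COUNT = 5000000;
 *     while (_apic.ICR_LO.send_pending) {
 *     }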
+ */ +struct apic_regs { + unsigned int _u1[4][2]; + unsigned int ID, _u2[3]; + unsigned int VER, _u3[3]; + unsigned int _u4[4][4]; + unsigned int TPR, _u5[3]; + unsigned int APR, _u6[3]; + unsigned int PPR, _u7[3]; + unsigned int EOI, _u8[3]; + unsigned int RRD, _u9[3]; + unsigned int LDR, _u10[3]; + unsigned int DFR, _u11[3]; + unsigned int SPURIOUS, _u12[3]; + unsigned int ISR_BITS[4][8]; + unsigned int TMR_BITS[4][8]; + unsigned int IRR_BITS[4][8]; + unsigned int ERR_STATUS, _u13[3]; + unsigned int _u14[4][6]; + struct apic_lvt LVT_CMCI; unsigned int _u15[3]; + struct apic_icr_lo ICR_LO, _u16[3]; + struct apic_icr_hi ICR_HI, _u17[3]; + struct apic_lvt LVT_TIMER; unsigned int _u18[3]; + struct apic_lvt LVT_THERMAL; unsigned int _u19[3]; + struct apic_lvt LVT_PERF; unsigned int _u20[3]; + struct apic_lvt LVT_LINT0; unsigned int _u21[3]; + struct apic_lvt LVT_LINT1; unsigned int _u22[3]; + struct apic_lvt LVT_ERROR; unsigned int _u23[3]; + unsigned int INIT_COUNT, _u24[3]; + unsigned int CURR_COUNT, _u25[3]; + unsigned int _u26[4][4]; + unsigned int DIVIDE_CONF, _u27[3]; +}; + +#define _apic (*((volatile struct apic_regs *)0xfee00000ll)) + +/* Crazy encoding for this, but susceptable to a formula. Returns the + * DIVIDE_CONF register value that divides the input clock by 2^n (n + * in the range 0-7). + */ +#define APIC_DIVISOR(n) (((((n) - 1) << 1) & 8)|(((n) - 1) & 3)) + +#define IOREGSEL (*(volatile unsigned int *)0xfec00000l) +#define IOREGWIN (*(volatile unsigned int *)0xfec00010l) + +/* Assumes one IO-APIC. Note that because of the way the register API + * works, this must be spinlocked or otherwise protected against other + * CPUs (e.g. do it all on cpu0 at startup, etc...). + */ +static inline unsigned int ioapic_read(int reg) +{ + IOREGSEL = reg; + return IOREGWIN; +} + +static inline void ioapic_write(int reg, unsigned int val) +{ + IOREGSEL = reg; + IOREGWIN = val; +} + +/* IOAPIC redirection table entry */ +struct ioapic_red { + union { + unsigned int regvals[2]; + struct { + unsigned int vector : 8; + enum apic_delivery_mode : 3; + unsigned int logical : 1; + unsigned int send_pending : 1; + unsigned int active_low : 1; + unsigned int remote_irr : 1; + unsigned int level_triggered : 1; + unsigned int masked : 1; + unsigned int _UNUSED1 : 15; + unsigned int _UNUSED2 : 24; + unsigned int destination : 8; + }; + }; +}; + +#define GET_CR(reg) ({ unsigned int _r; \ + __asm__ volatile("movl %%" reg ", %0\n\t" \ + : "=r"(_r)); \ + _r; }) + +#define SET_CR(reg, val) \ + do { \ + int tmp = val; \ + __asm__ volatile("movl %0, %%" reg "\n\t" :: "r"(tmp)); \ + } while (0) + +#define SET_CR_BIT(reg, bit) SET_CR(reg, GET_CR(reg) | (1 << bit)) + +static inline void ioport_out8(unsigned short port, unsigned char b) +{ + __asm__ volatile("outb %0, %1;\n\t" : : "a"(b), "d"(port)); +} + + +static inline unsigned char ioport_in8(unsigned short port) +{ + unsigned char ret; + + __asm__ volatile("inb %1, %0;\n\t" : "=a"(ret) : "d"(port)); + return ret; +} + +static inline void set_msr_bit(unsigned int msr, int bit) +{ + unsigned int mask = 1 << bit; + + __asm__ volatile("rdmsr; or %0, %%eax; wrmsr" + :: "r"(mask), "c"(msr) : "eax", "edx"); +} + +static inline unsigned int get_msr(unsigned int msr) +{ + unsigned int val; + + __asm__ volatile("rdmsr" : "=a"(val) : "c"(msr) : "edx"); + return val; +} + +static inline unsigned long long rdtsc(void) +{ + unsigned long long rax, rdx; + + __asm__ volatile("rdtsc" : "=a"(rax), "=d"(rdx)); + return rdx << 32 | rax; +} + +#endif /* _X86_64_HW_H */ diff --git 
a/arch/x86_64/core/x86_64.c b/arch/x86_64/core/x86_64.c new file mode 100644 index 00000000000..9fa48455252 --- /dev/null +++ b/arch/x86_64/core/x86_64.c @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#include +#include +#include +#include +#include +#include "xuk.h" + +struct device; + +struct NANO_ESF { +}; + +void _new_thread(struct k_thread *t, k_thread_stack_t *stack, + size_t sz, k_thread_entry_t entry, + void *p1, void *p2, void *p3, + int prio, unsigned int opts) +{ + void *args[] = { entry, p1, p2, p3 }; + int nargs = 4; + int eflags = 0x200; + char *base = K_THREAD_STACK_BUFFER(stack); + char *top = base + sz; + + _new_thread_init(t, base, sz, prio, opts); + + t->switch_handle = (void *)xuk_setup_stack((long) top, + (void *)_thread_entry, + eflags, (long *)args, + nargs); +} + +void k_cpu_idle(void) +{ + z_sys_trace_idle(); + __asm__ volatile("sti; hlt"); +} + +void _unhandled_vector(int vector, int err, struct xuk_entry_frame *f) +{ + /* Yes, there are five regsiters missing. See notes on + * xuk_entry_frame/xuk_stack_frame. + */ + printk("*** FATAL ERROR vector %d code %d\n", vector, err); + printk("*** RIP %d:0x%llx RSP %d:0x%llx RFLAGS 0x%llx\n", + (int)f->cs, f->rip, (int)f->ss, f->rsp, f->rflags); + printk("*** RAX 0x%llx RCX 0x%llx RDX 0x%llx RSI 0x%llx RDI 0x%llx\n", + f->rax, f->rcx, f->rdx, f->rsi, f->rdi); + printk("*** R8 0x%llx R9 0x%llx R10 0x%llx R11 0x%llx\n", + f->r8, f->r9, f->r10, f->r11); + + _NanoFatalErrorHandler(x86_64_except_reason, NULL); +} + +void _isr_entry(void) +{ + _arch_curr_cpu()->nested++; +} + +void *_isr_exit_restore_stack(void *interrupted) +{ + bool nested = (--_arch_curr_cpu()->nested) > 0; + void *next = _get_next_switch_handle(interrupted); + + return (nested || next == interrupted) ? NULL : next; +} + +struct { + void (*fn)(int, void*); + void *arg; + unsigned int esp; +} cpu_init[CONFIG_MP_NUM_CPUS]; + +/* Called from Zephyr initialization */ +void _arch_start_cpu(int cpu_num, k_thread_stack_t *stack, int sz, + void (*fn)(int, void *), void *arg) +{ + cpu_init[cpu_num].arg = arg; + cpu_init[cpu_num].esp = (int)(long)(sz + (char *)stack); + + /* This is our flag to the spinning CPU. Do this last */ + cpu_init[cpu_num].fn = fn; +} + +#ifdef CONFIG_IRQ_OFFLOAD +static irq_offload_routine_t offload_fn; +static void *offload_arg; + +static void irq_offload_handler(void *arg, int err) +{ + ARG_UNUSED(arg); + ARG_UNUSED(err); + offload_fn(offload_arg); +} + +void irq_offload(irq_offload_routine_t fn, void *arg) +{ + offload_fn = fn; + offload_arg = arg; + __asm__ volatile("int %0" : : "i"(CONFIG_IRQ_OFFLOAD_VECTOR)); +} +#endif + +/* Default. Can be overridden at link time by a timer driver */ +void __weak x86_apic_timer_isr(void *arg, int code) +{ + ARG_UNUSED(arg); + ARG_UNUSED(code); +} + +/* Called from xuk layer on actual CPU start */ +void _cpu_start(int cpu) +{ + xuk_set_f_ptr(cpu, &_kernel.cpus[cpu]); + + /* Set up the timer ISR, but ensure the timer is disabled */ + xuk_set_isr(INT_APIC_LVT_TIMER, 13, x86_apic_timer_isr, 0); + _apic.INIT_COUNT = 0; + +#ifdef CONFIG_IRQ_OFFLOAD + xuk_set_isr(XUK_INT_RAW_VECTOR(CONFIG_IRQ_OFFLOAD_VECTOR), + -1, irq_offload_handler, 0); +#endif + + if (cpu <= 0) { + /* The SMP CPU startup function pointers act as init + * flags. Zero them here because this code is running + * BEFORE .bss is zeroed! Should probably move that + * out of _Cstart() for this architecture... 
+ */ + for (int i = 0; i < CONFIG_MP_NUM_CPUS; i++) { + cpu_init[i].fn = 0; + } + + /* Enter Zephyr */ + _Cstart(); + + } else if (cpu < CONFIG_MP_NUM_CPUS) { + /* SMP initialization. First spin, waiting for + * _arch_start_cpu() to be called from the main CPU + */ + while (!cpu_init[cpu].fn) { + } + + /* Enter Zephyr, which will switch away and never return */ + cpu_init[cpu].fn(0, cpu_init[cpu].arg); + } + + /* Spin forever as a fallback */ + while (1) { + } +} + +/* Returns the initial stack to use for CPU startup on auxiliary (not + * cpu 0) processors to the xuk layer, which gets selected by the + * non-arch Zephyr kernel and stashed by _arch_start_cpu() + */ +unsigned int _init_cpu_stack(int cpu) +{ + return cpu_init[cpu].esp; +} + +int _arch_irq_connect_dynamic(unsigned int irq, unsigned int priority, + void (*routine)(void *parameter), void *parameter, + u32_t flags) +{ + ARG_UNUSED(flags); + __ASSERT(priority >= 2 && priority <= 15, + "APIC interrupt priority must be 2-15"); + + xuk_set_isr(irq, priority, (void *)routine, parameter); + return 0; +} + +void _arch_irq_disable(unsigned int irq) +{ + xuk_set_isr_mask(irq, 1); +} + +void _arch_irq_enable(unsigned int irq) +{ + xuk_set_isr_mask(irq, 0); +} + +void x86_apic_set_timeout(u32_t cyc_from_now) +{ + _apic.INIT_COUNT = cyc_from_now; +} + +const NANO_ESF _default_esf; + +int x86_64_except_reason; + +void _NanoFatalErrorHandler(unsigned int reason, const NANO_ESF *esf) +{ + _SysFatalErrorHandler(reason, esf); +} + +/* App-overridable handler. Does nothing here */ +void __weak _SysFatalErrorHandler(unsigned int reason, const NANO_ESF *esf) +{ + ARG_UNUSED(reason); + ARG_UNUSED(esf); + k_thread_abort(_current); +} diff --git a/arch/x86_64/core/xuk-config.h b/arch/x86_64/core/xuk-config.h new file mode 100644 index 00000000000..7c1b9a38c21 --- /dev/null +++ b/arch/x86_64/core/xuk-config.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef _XUK_CONFIG_H +#define _XUK_CONFIG_H + +/* This file defines "kconfig" variables used by the xuk layer only in + * unit test situations where we aren't using pulling in the true + * autoconf.h + */ +#ifndef CONFIG_X86_64 + +/* #define CONFIG_XUK_DEBUG 1 */ + +/* The APIC timer will run 2^X times slower than the TSC. (X = 0-7) */ +#define CONFIG_XUK_APIC_TSC_SHIFT 5 + +#define CONFIG_MP_NUM_CPUS 2 + +#define CONFIG_XUK_64_BIT_ABI 1 + +#endif /* CONFIG_X86_64 */ +#endif /* _XUK_CONFIG_H */ diff --git a/arch/x86_64/core/xuk-stub16.c b/arch/x86_64/core/xuk-stub16.c new file mode 100644 index 00000000000..0e341d1d60e --- /dev/null +++ b/arch/x86_64/core/xuk-stub16.c @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#include "serial.h" +#include "x86_64-hw.h" +#include "shared-page.h" + +/* + * 16 bit boot stub. This code gets copied into a low memory page and + * used as the bootstrap code for SMP processors, which always start + * in real mode. It is compiled with gcc's -m16 switch, which is a + * wrapper around the assembler's .code16gcc directive which cleverly + * takes 32 bit assembly and "fixes" it with appropriate address size + * prefixes to run in real mode on a 386. + * + * It is just code! We have the .text segment and NOTHING ELSE. No + * static or global variables can be used, nor const read-only data. + * Neither is the linker run, so nothing can be relocated and all + * symbolic references need to be to addresses within this file. 
In + * fact, any relocations that do sneak in will be left at zero at + * runtime! + */ + +__asm__(" cli\n" + " xor %ax, %ax\n" + " mov %ax, %ss\n" + " mov %ax, %ds\n" + " mov $80000, %esp\n" /* FIXME: put stack someplace officiallerish */ + " jmp _start16\n"); + +void _start16(void) +{ +#ifdef XUK_DEBUG + serial_putc('1'); serial_putc('6'); serial_putc('\n'); +#endif + + /* First, serialize on a simple spinlock. Note there's a + * theoretical flaw here in that we are on a shared stack with the + * other CPUs here and we don't *technically* know that "oldlock" + * does not get written to the (clobberable!) stack memory. But + * in practice the compiler does the right thing here and we spin + * in registers until exiting the loop, at which point we are the + * only users of the stack, and thus safe. + */ + int oldlock; + + do { + __asm__ volatile("pause; mov $1, %%eax; xchg %%eax, (%1)" + : "=a"(oldlock) : "m"(_shared.smpinit_lock)); + } while (oldlock); + + /* Put a red banner at the top of the screen to announce our + * presence + */ + volatile unsigned short *vga = (unsigned short *)0xb8000; + + for (int i = 0; i < 240; i++) + vga[i] = 0xcc20; + + /* Spin again waiting on the BSP processor to give us a stack. We + * won't use it until the entry code of stub32, but we want to + * make sure it's there before we jump. + */ + while (!_shared.smpinit_stack) { + } + + /* Load the GDT the CPU0 already prepared for us */ + __asm__ volatile ("lgdtw (%0)\n" : : "r"(_shared.gdt16_addr)); + + /* Enter protected mode by setting the bottom bit of CR0 */ + int cr0; + + __asm__ volatile ("mov %%cr0, %0\n" : "=r"(cr0)); + cr0 |= 1; + __asm__ volatile ("mov %0, %%cr0\n" : : "r"(cr0)); + + /* Set up data and stack segments */ + short ds = GDT_SELECTOR(2); + + __asm__ volatile ("mov %0, %%ds; mov %0, %%ss" : : "r"(ds)); + + /* Far jump to the 32 bit entry point, passing a cookie in EAX to + * tell it what we're doing + */ + int magic = BOOT_MAGIC_STUB16; + + __asm__ volatile ("ljmpl $0x8,$0x100000" : : "a"(magic)); + + while (1) { + __asm__("hlt"); + } +} diff --git a/arch/x86_64/core/xuk-stub32.c b/arch/x86_64/core/xuk-stub32.c new file mode 100644 index 00000000000..f718cc746ee --- /dev/null +++ b/arch/x86_64/core/xuk-stub32.c @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#include "xuk-config.h" +#include "shared-page.h" +#include "x86_64-hw.h" + +#ifdef CONFIG_XUK_DEBUG +#include "printf.h" +#include "vgacon.h" +#include "serial.h" +#else +int printf(const char *fmt, ...) +{ + return 0; +} +#endif + +/* This i386 code stub is designed to link internally (i.e. it shares + * nothing with the 64 bit world) and be loaded into RAM in high + * memory (generally at 0x100000) in a single (R/W/X) block with its + * .text, .rodata, .data and .bss included. Its stack lives in the + * fifth page of memory at 0x04000-0x4fff. After finishing 64 bit + * initialization, it will JMP to the 16-byte-aligned address that + * immediately follows this block in memory (exposed by the linker as + * _start64), which should then be able to run in an environment where + * all of physical RAM is mapped, except for the bottom 16kb. + * + * Memory layout on exit: + * + * + Pages 0-3 are an unmapped NULL guard + * + Page 4: contains stack and bss for the setup code, and a GDT. + * After 64 bit setup, it's likely this will be reused . + * + Pages 5-11: are the bootstrap page table + * + * Note that the initial page table makes no attempt to identify + * memory regions. 
Everything in the first 4G is mapped as cachable + * RAM. MMIO drivers will need to remap their memory based on PCI BAR + * regions or whatever. + */ + +/* Cute trick to turn a preprocessor macro containing a number literal + * into a string immediate in gcc basic asm context + */ +#define _ASM_IMM(s) #s +#define ASM_IMM(s) "$" _ASM_IMM(s) + +/* Entry point, to be linked at the very start of the image. Set a + * known-good stack (either the top of the shared page for the boot + * CPU, or one provided by stub16 on others), push the multiboot + * arguments in EAX, EBX and call into C code. + */ +__asm__(".pushsection .start32\n" + " mov $0x5000, %esp\n" + " xor %edx, %edx\n" + " cmp " ASM_IMM(BOOT_MAGIC_STUB16) ", %eax\n" + " cmove 0x4000(%edx), %esp\n" + " pushl %ebx\n" + " pushl %eax\n" + " call cstart\n" + ".popsection\n"); + +/* The multiboot header can be anywhere in the first 4k of the file. + * This stub doesn't get that big, so we don't bother with special + * linkage. + */ +#define MULTIBOOT_MAGIC 0x1badb002 +#define MULTIBOOT_FLAGS (1<<1) /* 2nd bit is "want memory map" */ +const int multiboot_header[] = { + MULTIBOOT_MAGIC, + MULTIBOOT_FLAGS, + -(MULTIBOOT_MAGIC + MULTIBOOT_FLAGS), /* csum: -(magic+flags) */ +}; + +/* Creates and returns a generic/sane page table for 64 bit startup + * (64 bit mode requires paging enabled). All of the bottom 4G + * (whether backing memory is present or not) gets a mapping with 2M + * pages, except that the bottom 2M are mapped with 4k pages and leave + * the first four pages unmapped as a NULL guard. + * + * Makes no attempt to identify non-RAM/MMIO regions, it just maps + * everything. We rely on the firmware to have set up MTRRs for us + * where needed, otherwise that will all be cacheable memory. + */ +void *init_page_tables(void) +{ + /* Top level PML4E points to a single PDPTE in its first entry */ + struct pte64 *pml4e = alloc_page(1); + struct pte64 *pdpte = alloc_page(1); + + pml4e[0].addr = (unsigned long)pdpte; + pml4e[0].present = 1; + pml4e[0].writable = 1; + + /* The PDPTE has four entries covering the first 4G of memory, + * each pointing to a PDE + */ + for (unsigned int gb = 0; gb < 4; gb++) { + struct pte64 *pde = alloc_page(0); + + pdpte[gb].addr = (unsigned long)pde; + pdpte[gb].present = 1; + pdpte[gb].writable = 1; + + /* Each PDE filled with 2M supervisor pages */ + for (int i = 0; i < 512; i++) { + if (!(gb == 0 && i == 0)) { + pde[i].addr = (gb << 30) | (i << 21); + pde[i].present = 1; + pde[i].writable = 1; + pde[i].pagesize_pat = 1; + } else { + /* EXCEPT the very first entry of the + * first GB, which is a pointer to a + * PTE of 4k pages so that we can have + * a 16k (4-page) NULL guard unmapped. + */ + struct pte64 *pte = alloc_page(0); + + pde[0].addr = (unsigned long)pte; + pde[0].present = 1; + pde[0].writable = 1; + + for (int j = 0; j < 512; j++) { + if (j < 4) { + pte[j].addr = 0; + } else { + pte[j].addr = j << 12; + pte[j].present = 1; + pte[j].writable = 1; + } + } + } + } + } + + /* Flush caches out of paranoia. In theory, x86 page walking + * happens downstream of the system-coherent dcache and this + * isn't needed. + */ + __asm__ volatile("wbinvd"); + return pml4e; +} + +#ifdef CONFIG_XUK_DEBUG +void putchar(int c) +{ + serial_putc(c); + vgacon_putc(c); +} +#endif + +void cstart(unsigned int magic, unsigned int arg) +{ + if (magic == BOOT_MAGIC_STUB16) { + printf("SMP CPU up in 32 bit protected mode. 
Stack ~%xh\n", + &magic); + } + + if (magic != BOOT_MAGIC_STUB16) { + shared_init(); +#ifdef CONFIG_XUK_DEBUG + serial_init(); + _putchar = putchar; +#endif + + printf("Entering stub32 on boot cpu, magic %xh stack ~%xh\n", + magic, (int)&magic); + } + + /* The multiboot memory map turns out not to be very useful. + * The basic numbers logged here are only a subset of the true + * memory map if it has holes or >4G memory, and the full map + * passed in the second argument tends to live in low memory + * and get easily clobbered by our own muckery. If we care + * about reading memory maps at runtime we probably want to be + * using BIOS e820 like Linux does. + */ + if (magic == BOOT_MAGIC_MULTIBOOT) { + printf("Hi there!\n"); + printf("This is a second line!\n"); + printf("And this line was generated from %s\n", "printf!"); + + printf("Magic: %p MBI Addr: %p\n", magic, arg); + + int mem_lower = *(int *)(arg + 4); + int mem_upper = *(int *)(arg + 8); + int mmap_length = *(int *)(arg + 44); + int *mmap_addr = *(void **)(arg + 48); + + printf("mem lower %d upper %d mmap_len %d mmap_addr %p\n", + mem_lower, mem_upper, mmap_length, mmap_addr); + } + + /* Choose a stack pointer and CPU ID for the 64 bit code to + * use. Then if we're not the boot CPU, release the spinlock + * (taken in stub16) so the other CPUs can continue. + */ + int cpu_id = 0; + unsigned int init_stack = 0x5000; + + if (magic == BOOT_MAGIC_STUB16) { + cpu_id = _shared.num_active_cpus++; + init_stack = _shared.smpinit_stack; + _shared.smpinit_stack = 0; + __asm__ volatile("movl $0, (%0)" : : "m"(_shared.smpinit_lock)); + } + + /* Page table goes in CR3. This is a noop until paging is + * enabled later + */ + if (magic != BOOT_MAGIC_STUB16) { + _shared.base_cr3 = (unsigned int)init_page_tables(); + } + SET_CR("cr3", _shared.base_cr3); + + /* Enable PAE bit (5) in CR4, required because in long mode + * we'll be using the 64 bit page entry format. Likewise a + * noop until the CPU starts loading pages. + */ + SET_CR_BIT("cr4", 5); + + /* Set LME (long mode enable) in IA32_EFER. Still not a mode + * transition, simply tells the CPU that, once paging is + * enabled, we should enter long mode. At that point the LMA + * bit (10) will be set to indicate that it's active. + */ + const int MSR_IA32_EFER = 0xc0000080; + + set_msr_bit(MSR_IA32_EFER, 8); + + /* NOW we transition by turning paging on. The CPU will start + * page translation (which has been carefully + * identity-mapped!) and enter the 32 bit compatibility + * submode of long mode. So we're reading 64 bit page tables + * but still executing 32 bit instructions. + */ + SET_CR_BIT("cr0", 31); + + printf("Hello memory mapped world!\n"); + + /* Now we can enter true 64 bit long mode via a far call to a + * code segment with the 64 bit flag set. Allocate a 2-entry + * GDT (entry 0 is always a "null segment" architecturally and + * can't be used) here on the stack and throw it away after + * the jump. The 64 bit OS code will need to set the + * descriptors up for itself anyway + */ + struct gdt64 cs[] = { + { }, + { + .readable = 1, + .codeseg = 1, + .notsystem = 1, + .present = 1, + .long64 = 1, + }, + }; + + /* The limit comes first, but is 16 bits. 
The dummy is there + * for alignment, though docs aren't clear on whether it's + * required or not + */ + struct { + unsigned short dummy; + unsigned short limit; + unsigned int addr; + } gdtp = { .limit = sizeof(cs), .addr = (int)&cs[0], }; + + printf("CS descriptor 0x%x 0x%x\n", cs[1].dwords[1], cs[1].dwords[0]); + __asm__ volatile("lgdt %0" : : "m"(gdtp.limit) : "memory"); + + /* Finally, make a far jump into the 64 bit world. The entry + * point is a 16-byte-aligned address that immediately follows + * our stub, and is exposed by our linkage as "_start64". + * + * Indirect far jumps have a similar crazy setup to descriptor + * tables, but here the segment selector comes last so no + * alignment worries. + * + * The 64 bit entry reuses the same stack we're on, and takes + * the cpu_id in its first argument. + */ + extern int _start64; + unsigned int jmpaddr = (unsigned int) &_start64; + struct { + unsigned int addr; + unsigned short segment; + } farjmp = { .segment = GDT_SELECTOR(1), .addr = jmpaddr }; + + printf("Making far jump to 64 bit mode @%xh...\n", &_start64); + __asm__ volatile("mov %0, %%esp; ljmp *%1" :: + "r"(init_stack), "m"(farjmp), "D"(cpu_id) + : "memory"); +} diff --git a/arch/x86_64/core/xuk-stub32.ld b/arch/x86_64/core/xuk-stub32.ld new file mode 100644 index 00000000000..16e50761c63 --- /dev/null +++ b/arch/x86_64/core/xuk-stub32.ld @@ -0,0 +1,21 @@ +ENTRY(_start) + +PHDRS { + stub32 PT_LOAD; +} + +SECTIONS { + . = 0x100000; + _start = .; + .stub32 : { + *(.start32) + *(.text*) + *(.rodata*) + *(.data*) + *(.bss*) + *(COMMON) + } :stub32 + + . = ALIGN(16); + _start64 = .; +} diff --git a/arch/x86_64/core/xuk-stubs.c b/arch/x86_64/core/xuk-stubs.c new file mode 100644 index 00000000000..a4bc2ef9f8d --- /dev/null +++ b/arch/x86_64/core/xuk-stubs.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +/* This "C" file exists solely to include the contents of + * separately-compiled binary stubs into the link. It's easier than + * trying to objcopy the contents into linkable object files, + * especially when combined with cmake's somewhat odd special-cased + * dependency handling (which works fine with C files, of course). + */ + +/* The 32 bit stub is our entry point and goes into a separate linker + * section so it can be placed correctly + */ +__asm__(".section .xuk_stub32\n" + ".incbin \"xuk-stub32.bin\"\n"); + +/* The 16 bit stub is the start of execution for auxiliary SMP CPUs + * (also for real mode traps if we ever want to expose that + * capability) and just lives in rodata. It has to be copied into low + * memory by the kernel once it is running. + */ +__asm__(".section .rodata\n" + ".globl _xuk_stub16_start\n" + "_xuk_stub16_start:\n" + ".incbin \"xuk-stub16.bin\"\n" + ".globl _xuk_stub16_end\n" + "_xuk_stub16_end:\n"); + diff --git a/arch/x86_64/core/xuk.c b/arch/x86_64/core/xuk.c new file mode 100644 index 00000000000..8ffc06e9fd7 --- /dev/null +++ b/arch/x86_64/core/xuk.c @@ -0,0 +1,629 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#include "xuk-config.h" +#include "x86_64-hw.h" +#include "xuk.h" +#include "serial.h" + +#ifdef CONFIG_XUK_DEBUG +#include "vgacon.h" +#include "printf.h" +#else +#define printf(...) +#endif + +#define ARRAY_SIZE(a) (sizeof(a)/sizeof((a)[0])) + +/* Defined at the linker level in xuk-stubs.c */ +extern char _xuk_stub16_start, _xuk_stub16_end; + +/* 64 bit entry point. Lives immediately after the 32 bit stub. 
+ * Expects to have its stack already set up. + */ +__asm__(".pushsection .xuk_start64\n" + ".align 16\n" + " jmp _cstart64\n" + ".popsection\n"); + +/* Interrupt/exception entry points stored in the IDT. + * + * FIXME: the assembly below uses XCHG r/m, because I'm lazy and this + * was SO much easier than hand coding the musical chairs required to + * emulate it. But that instruction is outrageously slow (like 20+ + * cycle latency on most CPUs!), and this is interrupt entry. + * Replace, once we have a test available to detect bad register + * contents + */ +extern char _isr_entry_err, _isr_entry_noerr; +__asm__(/* Exceptions that push an error code arrive here. */ + ".align 16\n" + "_isr_entry_err:\n" + " xchg %rdx, (%rsp)\n" + " jmp _isr_entry2\n" + + /* IRQs with no error code land here, then fall through */ + ".align 16\n" + "_isr_entry_noerr:\n" + " push %rdx\n" + + /* Arrive here with RDX already pushed to the stack below the + * interrupt frame and (if needed) populated with the error + * code from the exception. It will become the third argument + * to the C handler. Stuff the return address from the call + * in the stub table into RDI (the first argument). + */ + "_isr_entry2:\n" + " xchg %rdi, 8(%rsp)\n" + " push %rax\n" + " push %rcx\n" + " push %rsi\n" + " push %r8\n" + " push %r9\n" + " push %r10\n" + " push %r11\n" + " mov %rsp, %rsi\n" /* stack in second arg */ + " call _isr_c_top\n" + + /* We have pushed only the caller-save registers at this + * point. Check return value to see if we are returning back + * into the same context or if we need to do a full dump and + * restore. + */ + " test %rax, %rax\n" + " jnz _switch_bottom\n" + " pop %r11\n" + " pop %r10\n" + " pop %r9\n" + " pop %r8\n" + " pop %rsi\n" + " pop %rcx\n" + " pop %rax\n" + " pop %rdx\n" + " pop %rdi\n" + " iretq\n"); + +/* Top half of a context switch. Arrive here with the "CPU pushed" + * part of the exception frame (SS, RSP, RFLAGS, CS, RIP) already on + * the stack, the context pointer to which to switch stored in RAX and + * a pointer into which to store the current context in RDX (NOTE: + * this will be a pointer to a 32 bit memory location if we are in x32 + * mode!). It will push the first half of the register set (the same + * caller-save registers pushed by an ISR) and then continue on to + * _switch_bottom to finish up. + */ +__asm__(".align 16\n" + ".global _switch_top\n" + "_switch_top:\n" + " push %rdi\n" + " push %rdx\n" + " push %rax\n" + " push %rcx\n" + " push %rsi\n" + " push %r8\n" + " push %r9\n" + " push %r10\n" + " push %r11\n" + " mov %rsp, %r8\n" + " sub $48, %r8\n" +#ifdef CONFIG_XUK_64_BIT_ABI + " movq %r8, (%rdx)\n" +#else + " movl %r8d, (%rdx)\n" +#endif + /* Fall through... */ + /* Bottom half of a switch, used by both ISR return and + * context switching. Arrive here with the exception frame + * and caller-saved registers already on the stack and the + * stack pointer to use for the restore in RAX. It will push + * the remaining registers and then restore. 
+ */ + ".align 16\n" + "_switch_bottom:\n" + " push %rbx\n" + " push %rbp\n" + " push %r12\n" + " push %r13\n" + " push %r14\n" + " push %r15\n" + " mov %rax, %rsp\n" + " pop %r15\n" + " pop %r14\n" + " pop %r13\n" + " pop %r12\n" + " pop %rbp\n" + " pop %rbx\n" + " pop %r11\n" + " pop %r10\n" + " pop %r9\n" + " pop %r8\n" + " pop %rsi\n" + " pop %rcx\n" + " pop %rax\n" + " pop %rdx\n" + " pop %rdi\n" + " iretq\n"); + +static unsigned int isr_stub_base; + +struct vhandler { + void (*fn)(void*, int); + void *arg; +}; + +static struct vhandler *vector_handlers; + +static void putchar(int c) +{ + serial_putc(c); +#ifdef XUK_DEBUG + vgacon_putc(c); +#endif +} + +long _isr_c_top(unsigned long vecret, unsigned long rsp, + unsigned long err) +{ + /* The vector stubs are 8-byte-aligned, so to get the vector + * index from the return address we just shift off the bottom + * bits + */ + int vector = (vecret - isr_stub_base) >> 3; + struct vhandler *h = &vector_handlers[vector]; + struct xuk_entry_frame *frame = (void *)rsp; + + _isr_entry(); + + /* Set current priority in CR8 to the currently-serviced IRQ + * and re-enable interrupts + */ + unsigned long long cr8, cr8new = vector >> 4; + + __asm__ volatile("movq %%cr8, %0;" + "movq %1, %%cr8;" + "sti" + : "=r"(cr8) : "r"(cr8new)); + + if (h->fn) { + h->fn(h->arg, err); + } else { + _unhandled_vector(vector, err, frame); + } + + /* Mask interrupts to finish processing (they'll get restored + * in the upcoming IRET) and restore CR8 + */ + __asm__ volatile("cli; movq %0, %%cr8" : : "r"(cr8)); + + /* Signal EOI if it's an APIC-managed interrupt */ + if (vector > 0x1f) { + _apic.EOI = 0; + } + + /* Subtle: for the "interrupted context pointer", we pass in + * the value our stack pointer WILL have once we finish + * spilling registers after this function returns. If this + * hook doesn't want to switch, it will return null and never + * save the value of the pointer. + */ + return (long)_isr_exit_restore_stack((void *)(rsp - 48)); +} + +static long choose_isr_entry(int vector) +{ + /* Constructed with 1's in the vector indexes defined to + * generate an error code. Couldn't find a clean way to make + * the compiler generate this code + */ + const int mask = 0x27d00; /* 0b00100111110100000000 */ + + if (vector < 32 && ((1 << vector) & mask)) { + return (long)&_isr_entry_err; + } else { + return (long)&_isr_entry_noerr; + } +} + +void xuk_set_isr(int interrupt, int priority, + void (*handler)(void *, int), void *arg) +{ + int v = interrupt - 0x100; + + /* Need to choose a vector number? Try all vectors at the + * specified priority. Clobber one if we have to. + */ + if (interrupt < 0x100 || interrupt > 0x1ff) { + for (int pi = 0; pi <= 0xf; pi++) { + v = (priority << 4) | pi; + if (!vector_handlers[v].fn) { + break; + } + } + } + + /* Need to set up IO-APIC? Set it up to deliver to all CPUs + * here (another API later will probably allow for IRQ + * affinity). Do a read/write cycle to avoid clobbering + * settings like edge triggering & polarity that might have + * been set up by other platform layers. We only want to muck + * with routing. + */ + if (interrupt < 0x100) { + struct ioapic_red red; + int regidx = 0x10 + 2 * interrupt; + + red.regvals[0] = ioapic_read(regidx); + red.regvals[1] = ioapic_read(regidx + 1); + red.vector = v; + red.logical = 0; + red.destination = 0xff; + red.masked = 1; + ioapic_write(regidx, red.regvals[0]); + ioapic_write(regidx + 1, red.regvals[1]); + } + + /* Is it a special interrupt? 
*/ + if (interrupt == INT_APIC_LVT_TIMER) { + struct apic_lvt lvt = { + .vector = v, + .mode = ONESHOT, + }; + + _apic.LVT_TIMER = lvt; + } + + printf("set_isr v %d\n", v); + + vector_handlers[v].fn = handler; + vector_handlers[v].arg = arg; +} + +/* Note: "raw vector" interrupt numbers cannot be masked, as the APIC + * doesn't have a per-vector mask bit. Only specific LVT interrupts + * (we handle timer below) and IOAPIC-generated interrupts can be + * masked on x86. In practice, this isn't a problem as that API is a + * special-purpose kind of thing. Real devices will always go through + * the supported channel. + */ +void xuk_set_isr_mask(int interrupt, int masked) +{ + if (interrupt == INT_APIC_LVT_TIMER) { + struct apic_lvt lvt = _apic.LVT_TIMER; + + lvt.masked = masked; + _apic.LVT_TIMER = lvt; + } else if (interrupt < 0x100) { + struct ioapic_red red; + int regidx = 0x10 + 2 * interrupt; + + red.regvals[0] = ioapic_read(regidx); + red.regvals[1] = ioapic_read(regidx + 1); + red.masked = masked; + ioapic_write(regidx, red.regvals[0]); + ioapic_write(regidx + 1, red.regvals[1]); + } +} + +/* Note: these base pointers live together in a big block. Eventually + * we will probably want one of them for userspace TLS, which means it + * will need to be retargetted to point somewhere within the + * application memory. But this is fine for now. + */ +static void setup_fg_segs(int cpu) +{ + int fi = 3 + 2 * cpu, gi = 3 + 2 * cpu + 1; + struct gdt64 *fs = &_shared.gdt[fi]; + struct gdt64 *gs = &_shared.gdt[gi]; + + gdt64_set_base(fs, (long)&_shared.fs_ptrs[cpu]); + gdt64_set_base(gs, (long)&_shared.gs_ptrs[cpu]); + + int fsel = GDT_SELECTOR(fi), gsel = GDT_SELECTOR(gi); + + __asm__("mov %0, %%fs; mov %1, %%gs" : : "r"(fsel), "r"(gsel)); +} + +static void init_gdt(void) +{ + printf("Initializing 64 bit IDT\n"); + + /* Need a GDT for ourselves, not whatever the previous layer + * set up. The scheme is that segment zero is the null + * segment (required and enforced architecturally), segment + * one (selector 8) is the code segment, two (16) is a + * data/stack segment (ignored by code at runtime, but + * required to be present in the L/GDT when executing an + * IRET), and remaining segments come in pairs to provide + * FS/GS segment bases for each CPU. + */ + _shared.gdt[0] = (struct gdt64) {}; + _shared.gdt[1] = (struct gdt64) { + .readable = 1, + .codeseg = 1, + .notsystem = 1, + .present = 1, + .long64 = 1, + }; + _shared.gdt[2] = (struct gdt64) { + .readable = 1, + .codeseg = 0, + .notsystem = 1, + .present = 1, + .long64 = 1, + }; + for (int i = 3; i < ARRAY_SIZE(_shared.gdt); i++) { + _shared.gdt[i] = (struct gdt64) { + .readable = 1, + .codeseg = 0, + .notsystem = 1, + .present = 1, + .long64 = 1, + }; + } +} + +static void init_idt(void) +{ + printf("Initializing 64 bit IDT\n"); + + /* Make an IDT in the next unused page and fill in all 256 + * entries + */ + struct idt64 *idt = alloc_page(0); + + _shared.idt_addr = (unsigned int)(long)idt; + for (int i = 0; i < 256; i++) { + idt[i] = (struct idt64) { + .segment = GDT_SELECTOR(1), + .type = 14, /* == 64 bit interrupt gate */ + .present = 1, + }; + } + + /* Hand-encode stubs for each vector that are a simple 5-byte + * CALL instruction to the single handler entry point. That's + * an opcode of 0xe8 followd by a 4-byte offset from the start + * of the next (!) instruction. The call is used to push a + * return address on the stack that points into the stub, + * allowing us to extract the vector index by what stub it + * points into. 
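 * (That recovery is exactly what _isr_c_top() above does:
 *     vector = (return_address - isr_stub_base) >> 3;
 * because each stub occupies one 8-byte slot.)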
+ */ + struct istub { + unsigned char opcode; /* 0xe8 == CALLQ */ + int off; + unsigned char _unused[3]; + } __attribute__((packed)) *stubtab = alloc_page(0); + + isr_stub_base = (long)stubtab; + + /* FIXME: on x32, the entries in this handlers table are half + * the size as a native 64 bit build, and could be packed into + * the same page as the stubs above, saving the page of low + * memory. + */ + vector_handlers = alloc_page(1); + + for (int i = 0; i < 256; i++) { + struct istub *st = &stubtab[i]; + + st->opcode = 0xe8; + st->off = choose_isr_entry(i) - (long)st - 5; + idt64_set_isr(&idt[i], st); + } +} + +static void smp_init(void) +{ + /* Generate a GDT for the 16 bit stub to use when + * transitioning to 32 bit protected mode (so the poor thing + * doesn't have to do it itself). It can live right here on + * our stack. + */ + struct gdt64 gdt16[] = { + {}, + { + .codeseg = 1, + .default_size = 1, + .readable = 1, + .notsystem = 1, + .present = 1, + .limit_lo16 = 0xffff, + .limit_hi4 = 0xf, + .page_granularity = 1, + }, + { + .readable = 1, + .default_size = 1, + .notsystem = 1, + .present = 1, + .limit_lo16 = 0xffff, + .limit_hi4 = 0xf, + .page_granularity = 1, + }, + }; + struct { + short dummy; + short limit; + unsigned int addr; + } gdtp16 = { .limit = sizeof(gdt16), .addr = (long)&gdt16[0] }; + _shared.gdt16_addr = (long)&gdtp16.limit; + + /* FIXME: this is only used at startup, and only for a ~150 + * byte chunk of code. Find a way to return it, or maybe put + * it in the low memory null guard instead? + */ + char *sipi_page = alloc_page(1); + + int s16bytes = &_xuk_stub16_end - &_xuk_stub16_start; + + printf("Copying %d bytes of 16 bit code into page %p\n", + s16bytes, (int)(long)sipi_page); + for (int i = 0; i < s16bytes; i++) { + sipi_page[i] = ((char *)&_xuk_stub16_start)[i]; + } + + /* First send an INIT interrupt to all CPUs. This resets them + * regardless of whatever they were doing and they enter a + * "wait for SIPI" state + */ + printf("Sending INIT IPI\n"); + _apic.ICR_LO = (struct apic_icr_lo) { + .delivery_mode = INIT, + .shorthand = NOTSELF, + }; + while (_apic.ICR_LO.send_pending) { + } + + /* Now send the startup IPI (SIPI) to all CPUs. They will + * begin executing in real mode with IP=0 and CS pointing to + * the page we allocated. 
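+ * The SIPI vector field carries the physical page number of that
+ * code, so a page at 0x6000, for instance, becomes vector 6 and the
+ * APs begin fetching at CS=0x600, IP=0.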
+ */ + _shared.smpinit_lock = 0; + _shared.smpinit_stack = 0; + _shared.num_active_cpus = 1; + + printf("Sending SIPI IPI\n"); + _apic.ICR_LO = (struct apic_icr_lo) { + .delivery_mode = STARTUP, + .shorthand = NOTSELF, + .vector = ((long)sipi_page) >> 12, + }; + while (_apic.ICR_LO.send_pending) { + } + + for (int i = 1; i < CONFIG_MP_NUM_CPUS; i++) { + _shared.smpinit_stack = _init_cpu_stack(i); + printf("Granting stack @ %xh to CPU %d\n", + _shared.smpinit_stack, i); + + while (_shared.num_active_cpus <= i) { + __asm__("pause"); + } + } +} + +void _cstart64(int cpu_id) +{ + if (cpu_id == 0) { + extern char __bss_start, __bss_end; + + __builtin_memset(&__bss_start, 0, &__bss_end - &__bss_start); + } + +#ifdef CONFIG_XUK_DEBUG + _putchar = putchar; +#endif + printf("\n==\nHello from 64 bit C code on CPU%d (stack ~%xh)\n", + cpu_id, (int)(long)&cpu_id); + printf("sizeof(int) = %d, sizeof(long) = %d, sizeof(void*) = %d\n", + sizeof(int), sizeof(long), sizeof(void *)); + + if (cpu_id == 0) { + init_gdt(); + } + + struct { + unsigned short dummy[3]; + unsigned short limit; + unsigned long long addr; + } gdtp = { .limit = sizeof(_shared.gdt), .addr = (long)_shared.gdt }; + + printf("Loading 64 bit GDT\n"); + __asm__ volatile("lgdt %0" : : "m"(gdtp.limit)); + + /* Need to actually set the data & stack segments with those + * indexes. Whatever we have in those hidden registers works + * for data access *now*, but the next interrupt will push + * whatever the selector index was, and we need to know that + * our table contains the same layout! + */ + int selector = GDT_SELECTOR(2); + + __asm__ volatile("mov %0, %%ds; mov %0, %%ss" : : "r"(selector)); + + if (cpu_id == 0) { + init_idt(); + } + + struct { + unsigned short dummy[3]; + unsigned short limit; + unsigned long long addr; + } idtp = { .limit = 4096, .addr = _shared.idt_addr }; + + printf("Loading IDT lim %d addr %xh\n", idtp.limit, idtp.addr); + __asm__ volatile("lidt %0" : : "m"(idtp.limit)); + + /* Classic PC architecture gotcha: disable 8259 PICs before + * they fires a timer interrupt into our exception table. + * Write 1's into the interrupt masks. + */ + if (cpu_id == 0) { + printf("Disabling 8259 PICs\n"); + ioport_out8(0xa1, 0xff); /* slave */ + ioport_out8(0x21, 0xff); /* master */ + } + + /* Enable APIC. Set both the MSR bit and the "software + * enable" bit in the spurious interrupt vector register. + */ + const unsigned int IA32_APIC_BASE = 0x1b; + + printf("Enabling APIC id %xh ver %xh\n", _apic.ID, _apic.VER); + set_msr_bit(IA32_APIC_BASE, 11); + _apic.SPURIOUS |= 1<<8; + _apic.LDR = cpu_id << 24; + _apic.DIVIDE_CONF = APIC_DIVISOR(CONFIG_XUK_APIC_TSC_SHIFT); + + printf("Initializing FS/GS segments for local CPU%d\n", cpu_id); + setup_fg_segs(cpu_id); + + if (cpu_id == 0) { + printf("Brining up auxiliary CPUs...\n"); + smp_init(); + } + + printf("Calling _cpu_start on CPU %d\n", cpu_id); + _cpu_start(cpu_id); +} + +long xuk_setup_stack(long sp, void *fn, unsigned int eflags, + long *args, int nargs) +{ + long long *f = (long long *)(sp & ~7) - 20; + + /* FIXME: this should extend naturally to setting up usermode + * frames too: the frame should have a SS and RSP at the top + * that specifies the user stack into which to return (can be + * this same stack as long as the mapping is correct), and the + * CS should be a separate ring 3 segment. + */ + + f[19] = GDT_SELECTOR(2); + f[18] = sp; + f[17] = eflags; + f[16] = GDT_SELECTOR(1); + f[15] = (long)fn; + f[14] = nargs >= 1 ? args[0] : 0; /* RDI */ + f[13] = nargs >= 3 ? 
args[2] : 0; /* RDX */ + f[12] = 0; /* RAX */ + f[11] = nargs >= 4 ? args[3] : 0; /* RCX */ + f[10] = nargs >= 2 ? args[1] : 0; /* RSI */ + f[9] = nargs >= 5 ? args[4] : 0; /* R8 */ + f[8] = nargs >= 6 ? args[5] : 0; /* R9 */ + + /* R10, R11, RBX, RBP, R12, R13, R14, R15 */ + for (int i = 7; i >= 0; i--) { + f[i] = 0; + } + + return (long)f; +} + +int z_arch_printk_char_out(int c) +{ + putchar(c); + return 0; +} diff --git a/arch/x86_64/core/xuk.h b/arch/x86_64/core/xuk.h new file mode 100644 index 00000000000..9d21b9ce054 --- /dev/null +++ b/arch/x86_64/core/xuk.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef _XUK_H +#define _XUK_H + +#include +#include "shared-page.h" + +/* + * APIs exposed by the xuk layer to OS integration: + */ + +/* Set a single CPU-specific pointer which can be retrieved (on that + * CPU!) with get_f_ptr() + */ +static inline void xuk_set_f_ptr(int cpu, void *p) +{ + _shared.fs_ptrs[cpu] = (long)p; +} + +/* Likewise, but "G" */ +static inline void xuk_set_g_ptr(int cpu, void *p) +{ + _shared.gs_ptrs[cpu] = (long)p; +} + +/* Retrieves the pointer set by set_f_ptr() for the current CPU */ +static inline void *xuk_get_f_ptr() +{ + long long ret, off = 0; + + __asm__("movq %%fs:(%1), %0" : "=r"(ret) : "r"(off)); + return (void *)(long)ret; +} + +/* Retrieves the pointer set by set_g_ptr() for the current CPU */ +static inline void *xuk_get_g_ptr() +{ + long long ret, off = 0; + + __asm__("movq %%gs:(%1), %0" : "=r"(ret) : "r"(off)); + return (void *)(long)ret; +} + +/** + * @brief Sets a global handler for the specified interrupt. + * + * Interrupt numbers live in a partitioned space: + * + * + Values from 0 - 0xff are mapped to INTIx interrupts in the global + * index of IO-APIC inputs, which on many systems correspond to + * legacy IRQ0-IRQ15 interrupts at the bottom of the interrupt + * range. These handlers are not passed a meaningful value in their + * first argument, though the function pointer type declares one. + * + * + Values from 0x100 to 0x1ff are mapped to raw vectors 0x00-0xff + * and can be used for handling exceptions, for INT instructions, or + * for MSI- or IPI-directed interrupts that specifiy specific + * vectors. + * + * + Values outside this range may be exposed symbolically for other + * interrupts sources, for example local APIC LVT interrupts. + * + * If there is a pre-existing handler specified for a specified raw + * vector, this function will replace it. + * + * @param interrupt Interrupt number. See above for interpretation. + * @param priority Integer in the range 2-15. Higher-valued interrupts + * can interrupt lower ones. Ignored for raw vector + * numbers, as their priority is encoded in the top + * four bits of the vector number. A priority of zero + * is treated as "don't care" and the interrupt will + * be assigned the lowest available vector. + * @param handler Function pointer to invoke on interrupt receipt. It + * will be passed the specified argument as the first + * argument and the x86 exception error code (if any) + * in the second. + * @param arg Opaque value to pass to the handler when invoked. + * + */ +void xuk_set_isr(int interrupt, int priority, + void (*handler)(void *, int), void *arg); + +#define INT_APIC_LVT_TIMER 0x200 + +#define XUK_INT_RAW_VECTOR(vector) ((vector)+0x100) + +void xuk_set_isr_mask(int interrupt, int masked); + +/* Stack frame on interrupt entry. 
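+ * This is the layout the interrupt entry assembly builds and that
+ * _isr_c_top() sees through its stack pointer argument.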
Obviously they get pushed onto the + * stack in the opposite order than they appear here; the last few + * entries are the hardware frame. Note that not all registers are + * present, the ABI caller-save registers don't get pushed until after + * the handler as an optimization. + */ +struct xuk_entry_frame { + unsigned long long r11; + unsigned long long r10; + unsigned long long r9; + unsigned long long r8; + unsigned long long rsi; + unsigned long long rcx; + unsigned long long rax; + unsigned long long rdx; + unsigned long long rdi; + unsigned long long rip; + unsigned long long cs; + unsigned long long rflags; + unsigned long long rsp; + unsigned long long ss; +}; + +/* Full stack frame, i.e. the one used as the handles in xuk_switch(). + * Once more, the registers declared here are NOT POPULATED during the + * execution of an interrupt service routine. + */ +struct xuk_stack_frame { + unsigned long long r15; + unsigned long long r14; + unsigned long long r13; + unsigned long long r12; + unsigned long long rbp; + unsigned long long rbx; + struct xuk_entry_frame entry; +}; + +/* Sets up a new stack. The sp argument should point to the quadword + * above (!) the allocated stack area (i.e. the frame will be pushed + * below it). The frame will be set up to enter the function in the + * specified code segment with the specified flags register. An array + * of up to 6 function arguments may also be provided. Returns a + * handle suitable for passing to switch() or for returning from + * isr_exit_restore_stack(). + */ +long xuk_setup_stack(long sp, void *fn, unsigned int eflags, + long *args, int nargs); + +/* + * OS-defined utilities required by the xuk layer: + */ + +/* Returns the address of a stack pointer in 32 bit memory to be used + * by AP processor bootstraping and startup. + */ +unsigned int _init_cpu_stack(int cpu); + +/* OS CPU startup entry point, running on the stack returned by + * init_cpu_stack() + */ +void _cpu_start(int cpu); + +/* Called on receipt of an unregistered interrupt/exception. Passes + * the vector number and the CPU error code, if any. + */ +void _unhandled_vector(int vector, int err, struct xuk_entry_frame *f); + +/* Called on ISR entry before nested interrupts are enabled so the OS + * can arrange bookeeping. Really should be exposed as an inline and + * not a function call; cycles on interrupt entry are precious. + */ +void _isr_entry(void); + +/* Called on ISR exit to choose a next thread to run. The argument is + * a context pointer to the thread that was interrupted. + */ +void *_isr_exit_restore_stack(void *interrupted); + +#endif /* _XUK_H */ diff --git a/arch/x86_64/core/xuk64.ld b/arch/x86_64/core/xuk64.ld new file mode 100644 index 00000000000..235b576e648 --- /dev/null +++ b/arch/x86_64/core/xuk64.ld @@ -0,0 +1,15 @@ +SECTIONS { + . = 0x100000; + + .text : { + *(.xuk_stub32) + . 
= ALIGN(16); + *(.xuk_start64*) + *(.text*) + } + .rodata : { *(.rodata*) } + .data : { *(.data*) } + __bss_start = .; + .bss : { *(.bss*) *(COMMON) } + __bss_end = .; +} diff --git a/arch/x86_64/include/kernel_arch_data.h b/arch/x86_64/include/kernel_arch_data.h new file mode 100644 index 00000000000..827bb3cd08d --- /dev/null +++ b/arch/x86_64/include/kernel_arch_data.h @@ -0,0 +1,11 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef _KERNEL_ARCH_DATA_H +#define _KERNEL_ARCH_DATA_H + +struct _kernel_arch { }; + +#endif /* _KERNEL_ARCH_DATA_H */ diff --git a/arch/x86_64/include/kernel_arch_func.h b/arch/x86_64/include/kernel_arch_func.h new file mode 100644 index 00000000000..0268da3498b --- /dev/null +++ b/arch/x86_64/include/kernel_arch_func.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef _KERNEL_ARCH_FUNC_H +#define _KERNEL_ARCH_FUNC_H + +#include +#include + +static inline void kernel_arch_init(void) +{ + /* This is a noop, we already took care of things before + * _Cstart() is entered + */ +} + +static inline struct _cpu *_arch_curr_cpu(void) +{ + long long ret, off = 0; + + /* The struct _cpu pointer for the current CPU lives at the + * start of the the FS segment + */ + __asm__("movq %%fs:(%1), %0" : "=r"(ret) : "r"(off)); + return (struct _cpu *)(long)ret; +} + +static inline unsigned int _arch_irq_lock(void) +{ + unsigned long long key; + + __asm__ volatile("pushfq; cli; popq %0" : "=r"(key)); + return (int)key; +} + +static inline void _arch_irq_unlock(unsigned int key) +{ + if (key & 0x200) { + __asm__ volatile("sti"); + } +} + +static inline void arch_nop(void) +{ + __asm__ volatile("nop"); +} + +void _arch_irq_disable(unsigned int irq); +void _arch_irq_enable(unsigned int irq); + +/* Not a standard Zephyr function, but probably will be */ +static inline unsigned long long _arch_k_cycle_get_64(void) +{ + unsigned int hi, lo; + + __asm__ volatile("rdtsc" : "=d"(hi), "=a"(lo)); + return (((unsigned long long)hi) << 32) | lo; +} + +static inline unsigned int _arch_k_cycle_get_32(void) +{ +#ifdef CONFIG_HPET_TIMER + extern u32_t _timer_cycle_get_32(void); + return _timer_cycle_get_32(); +#else + return (u32_t)_arch_k_cycle_get_64(); +#endif +} + +#define _is_in_isr() (_arch_curr_cpu()->nested != 0) + +static inline void _arch_switch(void *switch_to, void **switched_from) +{ + xuk_switch(switch_to, switched_from); +} + +static inline u32_t x86_apic_scaled_tsc(void) +{ + u32_t lo, hi; + u64_t tsc; + + __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi)); + tsc = (((u64_t)hi) << 32) | lo; + return (u32_t)(tsc >> CONFIG_XUK_APIC_TSC_SHIFT); +} + +void x86_apic_set_timeout(u32_t cyc_from_now); + +#define _ARCH_IRQ_CONNECT(irq, pri, isr, arg, flags) \ + _arch_irq_connect_dynamic(irq, pri, isr, arg, flags) + +extern int x86_64_except_reason; + + +/* Vector 5 is the "bounds" exception which is otherwise vestigial + * (BOUND is an illegal instruction in long mode) + */ +#define _ARCH_EXCEPT(reason) do { \ + x86_64_except_reason = reason; \ + __asm__ volatile("int $5"); \ + } while (false) + +#endif /* _KERNEL_ARCH_FUNC_H */ diff --git a/arch/x86_64/include/kernel_arch_thread.h b/arch/x86_64/include/kernel_arch_thread.h new file mode 100644 index 00000000000..78fb5ee900f --- /dev/null +++ b/arch/x86_64/include/kernel_arch_thread.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef 
_KERNEL_ARCH_THREAD_H +#define _KERNEL_ARCH_THREAD_H + +/* Vestigial boilerplate. This must exist to it can be included in + * kernel.h to define these structs to provide types for fields in the + * Zephyr thread struct. But we don't need that for this arch. + */ + +struct _caller_saved { }; +struct _callee_saved { }; +struct _thread_arch { }; + +#endif /* _KERNEL_ARCH_THREAD_H */ diff --git a/arch/x86_64/include/offsets_short_arch.h b/arch/x86_64/include/offsets_short_arch.h new file mode 100644 index 00000000000..d5921dc50c1 --- /dev/null +++ b/arch/x86_64/include/offsets_short_arch.h @@ -0,0 +1,5 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ diff --git a/arch/x86_64/include/xuk-switch.h b/arch/x86_64/include/xuk-switch.h new file mode 100644 index 00000000000..8c3fe9d1ca1 --- /dev/null +++ b/arch/x86_64/include/xuk-switch.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef _XUK_SWITCH_H +#define _XUK_SWITCH_H + +/* This lives separate from the rest of the xuk API, as it has + * to be inlined into Zephyr code. + */ + +static inline void xuk_switch(void *switch_to, void **switched_from) +{ + /* Constructs an IRETQ interrupt frame, the final CALL pushes + * the RIP to which to return + */ + __asm__ volatile("mov %%rsp, %%rcx;" + "pushq $0x10;" /* SS */ + "pushq %%rcx;" /* RSP */ + "pushfq;" /* RFLAGS */ + "pushq $0x08;" /* CS */ + "callq _switch_top" + : : "a"(switch_to), "d"(switched_from) + : "ecx", "memory"); +} + +#endif /* _XUK_SWITCH_H */ diff --git a/boards/x86_64/qemu_x86_64/Kconfig.board b/boards/x86_64/qemu_x86_64/Kconfig.board new file mode 100644 index 00000000000..fa74dacff9a --- /dev/null +++ b/boards/x86_64/qemu_x86_64/Kconfig.board @@ -0,0 +1,4 @@ +config BOARD_QEMU_X86_64 + bool "QEMU x86_64" + depends on SOC_X86_64 + select QEMU_TARGET diff --git a/boards/x86_64/qemu_x86_64/Kconfig.defconfig b/boards/x86_64/qemu_x86_64/Kconfig.defconfig new file mode 100644 index 00000000000..172733baf74 --- /dev/null +++ b/boards/x86_64/qemu_x86_64/Kconfig.defconfig @@ -0,0 +1,9 @@ +if BOARD_QEMU_X86_64 + +config BUILD_OUTPUT_BIN + default n + +config BOARD + default "qemu_x86_64" + +endif # BOARD_QEMU_X86_64 diff --git a/boards/x86_64/qemu_x86_64/board.cmake b/boards/x86_64/qemu_x86_64/board.cmake new file mode 100644 index 00000000000..a33665082e1 --- /dev/null +++ b/boards/x86_64/qemu_x86_64/board.cmake @@ -0,0 +1,2 @@ +set(EMU_PLATFORM qemu) +set(QEMU_FLAGS_${ARCH} -nographic) diff --git a/boards/x86_64/qemu_x86_64/board.h b/boards/x86_64/qemu_x86_64/board.h new file mode 100644 index 00000000000..d5921dc50c1 --- /dev/null +++ b/boards/x86_64/qemu_x86_64/board.h @@ -0,0 +1,5 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ diff --git a/boards/x86_64/qemu_x86_64/qemu_x86_64.yaml b/boards/x86_64/qemu_x86_64/qemu_x86_64.yaml new file mode 100644 index 00000000000..07cc94d47e6 --- /dev/null +++ b/boards/x86_64/qemu_x86_64/qemu_x86_64.yaml @@ -0,0 +1,9 @@ +identifier: qemu_x86_64 +name: QEMU Emulation for X86_64 +type: qemu +simulation: qemu +arch: x86_64 +toolchain: + - zephyr +testing: + default: true diff --git a/boards/x86_64/qemu_x86_64/qemu_x86_64_defconfig b/boards/x86_64/qemu_x86_64/qemu_x86_64_defconfig new file mode 100644 index 00000000000..5b7b548e22a --- /dev/null +++ b/boards/x86_64/qemu_x86_64/qemu_x86_64_defconfig @@ -0,0 +1,12 @@ +CONFIG_X86_64=y +CONFIG_SOC_X86_64=y +CONFIG_BOARD_QEMU_X86_64=y 
+CONFIG_CONSOLE=y +CONFIG_SERIAL=y +CONFIG_TEST_RANDOM_GENERATOR=y +CONFIG_XIP=y +CONFIG_HPET_TIMER=y +CONFIG_SYS_CLOCK_HW_CYCLES_PER_SEC=1000000000 +CONFIG_MAIN_STACK_SIZE=1024 +CONFIG_IDLE_STACK_SIZE=1024 +CONFIG_TEST_EXTRA_STACKSIZE=2048 diff --git a/cmake/emu/qemu.cmake b/cmake/emu/qemu.cmake index b6ebad3ede3..7eabe9f61a2 100644 --- a/cmake/emu/qemu.cmake +++ b/cmake/emu/qemu.cmake @@ -219,6 +219,10 @@ if(CONFIG_X86_IAMCU) ) endif() +if(CONFIG_X86_64) + set(QEMU_KERNEL_FILE "${CMAKE_BINARY_DIR}/zephyr-qemu.elf") +endif() + if(NOT QEMU_PIPE) set(QEMU_PIPE_COMMENT "\nTo exit from QEMU enter: 'CTRL+a, x'\n") endif() @@ -230,9 +234,15 @@ list(APPEND QEMU_EXTRA_FLAGS ${env_qemu}) list(APPEND MORE_FLAGS_FOR_debugserver -s -S) -set_ifndef(QEMU_KERNEL_OPTION - "-kernel;$" - ) +# Architectures can define QEMU_KERNEL_FILE to use a specific output +# file to pass to qemu (and a "qemu_kernel_target" target to generate +# it), or set QEMU_KERNEL_OPTION if they want to replace the "-kernel +# ..." option entirely. +if(DEFINED QEMU_KERNEL_FILE) + set(QEMU_KERNEL_OPTION "-kernel;${QEMU_KERNEL_FILE}") +elseif(NOT DEFINED QEMU_KERNEL_OPTION) + set(QEMU_KERNEL_OPTION "-kernel;$") +endif() foreach(target ${qemu_targets}) add_custom_target(${target} @@ -250,4 +260,7 @@ foreach(target ${qemu_targets}) COMMENT "${QEMU_PIPE_COMMENT}[QEMU] CPU: ${QEMU_CPU_TYPE_${ARCH}}" USES_TERMINAL ) + if(DEFINED QEMU_KERNEL_FILE) + add_dependencies(${target} qemu_kernel_target) + endif() endforeach() diff --git a/drivers/timer/Kconfig b/drivers/timer/Kconfig index 7a01097cc24..a29cc50bd7e 100644 --- a/drivers/timer/Kconfig +++ b/drivers/timer/Kconfig @@ -12,9 +12,9 @@ menu "Timer Drivers" config HPET_TIMER bool "HPET timer" - depends on X86 - select IOAPIC - select LOAPIC + depends on (X86 || X86_64) + select IOAPIC if X86 + select LOAPIC if X86 select TIMER_READS_ITS_FREQUENCY_AT_RUNTIME select TICKLESS_CAPABLE help diff --git a/include/arch/cpu.h b/include/arch/cpu.h index 5c12d9c1d34..7b6320ac047 100644 --- a/include/arch/cpu.h +++ b/include/arch/cpu.h @@ -11,6 +11,8 @@ #if defined(CONFIG_X86) #include +#elif defined(CONFIG_X86_64) +#include #elif defined(CONFIG_ARM) #include #elif defined(CONFIG_ARC) diff --git a/include/arch/x86_64/arch.h b/include/arch/x86_64/arch.h new file mode 100644 index 00000000000..f3b52ae291c --- /dev/null +++ b/include/arch/x86_64/arch.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef _X86_64_ARCH_H +#define _X86_64_ARCH_H + +#include +#include + +#define STACK_ALIGN 8 + +typedef struct NANO_ESF NANO_ESF; +extern const NANO_ESF _default_esf; +void _SysFatalErrorHandler(unsigned int reason, const NANO_ESF *esf); +void _NanoFatalErrorHandler(unsigned int reason, const NANO_ESF *esf); + +/* Existing code requires only these particular symbols be defined, + * but doesn't put them in a global header. Needs cleaner + * cross-architecture standardization. Implement only the minimal set + * here. + */ +#define _NANO_ERR_STACK_CHK_FAIL 1 +#define _NANO_ERR_KERNEL_OOPS 2 +#define _NANO_ERR_KERNEL_PANIC 3 + +#endif /* _X86_64_ARCH_H */ diff --git a/include/device.h b/include/device.h index fac01a2354b..0032fb13126 100644 --- a/include/device.h +++ b/include/device.h @@ -237,6 +237,14 @@ struct device { struct device_config *config; const void *driver_api; void *driver_data; +#if defined(__x86_64) && __SIZEOF_POINTER__ == 4 + /* The x32 ABI hits an edge case. 
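+ * Under x32 the three pointer members above are only 4 bytes each.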
This is a 12 byte struct, + * but the x86_64 linker will pack them only in units of 8 + * bytes, leading to alignment problems when iterating over + * the link-time array. + */ + void *padding; +#endif }; void _sys_device_do_config_level(s32_t level); diff --git a/include/toolchain/common.h b/include/toolchain/common.h index 5d0d4146a61..49b80b43d7c 100644 --- a/include/toolchain/common.h +++ b/include/toolchain/common.h @@ -54,7 +54,7 @@ #ifdef _ASMLANGUAGE - #ifdef CONFIG_X86 + #if defined(CONFIG_X86) || defined(CONFIG_X86_64) #ifdef PERF_OPT #define PERFOPT_ALIGN .balign 16 diff --git a/include/toolchain/gcc.h b/include/toolchain/gcc.h index f800332e4e4..5eb27745480 100644 --- a/include/toolchain/gcc.h +++ b/include/toolchain/gcc.h @@ -347,6 +347,13 @@ A##a: ",%c0" \ "\n\t.type\t" #name ",@object" : : "n"(value)) +#elif defined(CONFIG_X86_64) + +#define GEN_ABSOLUTE_SYM(name, value) \ + __asm__(".globl\t" #name "\n\t.equ\t" #name \ + ",%0" \ + "\n\t.type\t" #name ",@object" : : "n"(value)) + #elif defined(CONFIG_NIOS2) || defined(CONFIG_RISCV32) || defined(CONFIG_XTENSA) /* No special prefixes necessary for constants in this arch AFAICT */ diff --git a/lib/libc/minimal/include/sys/types.h b/lib/libc/minimal/include/sys/types.h index 846dd313577..55a62116e21 100644 --- a/lib/libc/minimal/include/sys/types.h +++ b/lib/libc/minimal/include/sys/types.h @@ -20,8 +20,8 @@ typedef __SIZE_TYPE__ ssize_t; #if !defined(__off_t_defined) #define __off_t_defined -#ifdef __i386 -typedef long int off_t; +#if defined(__i386) || defined(__x86_64) +typedef long int off_t; /* "long" works for all of i386, X32 and true 64 bit */ #elif defined(__ARM_ARCH) typedef int off_t; #elif defined(__arc__) diff --git a/lib/libc/minimal/source/stdout/prf.c b/lib/libc/minimal/source/stdout/prf.c index bf7410d87f7..157634b315b 100644 --- a/lib/libc/minimal/source/stdout/prf.c +++ b/lib/libc/minimal/source/stdout/prf.c @@ -595,6 +595,14 @@ int _prf(int (*func)(), void *dest, char *format, va_list vargs) case 'G': /* standard platforms which supports double */ { +#ifdef CONFIG_X86_64 + /* Can't use a double here because + * we're operating in -mno-sse and + * va_arg() will expect this to be a + * register argument. 
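+ * _to_float() wants the raw bit
+ * pattern of the double (the
+ * generic path below builds it
+ * with a union), so the argument
+ * is pulled out as a plain 64-bit
+ * integer instead.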
+ */ + double_temp = va_arg(vargs, uint64_t); +#else union { double d; uint64_t i; @@ -602,6 +610,7 @@ int _prf(int (*func)(), void *dest, char *format, va_list vargs) u.d = (double) va_arg(vargs, double); double_temp = u.i; +#endif } c = _to_float(buf, double_temp, c, falt, fplus, diff --git a/samples/application_development/external_lib/sample.yaml b/samples/application_development/external_lib/sample.yaml index 57033d94c3a..21d6a854b6b 100644 --- a/samples/application_development/external_lib/sample.yaml +++ b/samples/application_development/external_lib/sample.yaml @@ -3,6 +3,7 @@ sample: tests: test: tags: external + platform_exclude: qemu_x86_64 harness: console harness_config: type: multi_line diff --git a/samples/drivers/crypto/sample.yaml b/samples/drivers/crypto/sample.yaml index 93905f53f94..25b7c194728 100644 --- a/samples/drivers/crypto/sample.yaml +++ b/samples/drivers/crypto/sample.yaml @@ -6,7 +6,7 @@ common: tags: crypto harness: console min_ram: 20 - arch_exclude: xtensa + arch_exclude: xtensa x86_64 tests: test-mbedtls: min_flash: 34 diff --git a/samples/sensor/thermometer/sample.yaml b/samples/sensor/thermometer/sample.yaml index a21d6d9a30b..c509fe8f13c 100644 --- a/samples/sensor/thermometer/sample.yaml +++ b/samples/sensor/thermometer/sample.yaml @@ -4,3 +4,4 @@ tests: test: tags: sensors harness: sensor + arch_exclude: x86_64 # No floating point on x86_64 yet \ No newline at end of file diff --git a/samples/subsys/logging/logger/sample.yaml b/samples/subsys/logging/logger/sample.yaml index 96952fd816c..27b0154866f 100644 --- a/samples/subsys/logging/logger/sample.yaml +++ b/samples/subsys/logging/logger/sample.yaml @@ -5,7 +5,7 @@ sample: tests: samples.logger: tags: logging - platform_exclude: qemu_xtensa + platform_exclude: qemu_xtensa qemu_x86_64 harness: console harness_config: type: one_line diff --git a/samples/synchronization/sample.yaml b/samples/synchronization/sample.yaml index bc83331079d..b2415e7f5bf 100644 --- a/samples/synchronization/sample.yaml +++ b/samples/synchronization/sample.yaml @@ -16,7 +16,7 @@ tests: tags: kernel synchronization extra_configs: - CONFIG_OPENOCD_SUPPORT=y - arch_exclude: posix xtensa + arch_exclude: posix xtensa x86_64 harness: console harness_config: type: multi_line diff --git a/soc/x86_64/x86_64/CMakeLists.txt b/soc/x86_64/x86_64/CMakeLists.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/soc/x86_64/x86_64/Kconfig.defconfig b/soc/x86_64/x86_64/Kconfig.defconfig new file mode 100644 index 00000000000..5a9693d0faa --- /dev/null +++ b/soc/x86_64/x86_64/Kconfig.defconfig @@ -0,0 +1,9 @@ +if SOC_X86_64 + +config SOC + default "x86_64" + +config USE_SWITCH + default y + +endif diff --git a/soc/x86_64/x86_64/Kconfig.soc b/soc/x86_64/x86_64/Kconfig.soc new file mode 100644 index 00000000000..3c52450b2cb --- /dev/null +++ b/soc/x86_64/x86_64/Kconfig.soc @@ -0,0 +1,2 @@ +config SOC_X86_64 + bool "Generic x86_64 PC" diff --git a/soc/x86_64/x86_64/linker.ld b/soc/x86_64/x86_64/linker.ld new file mode 100644 index 00000000000..871c5dc44d7 --- /dev/null +++ b/soc/x86_64/x86_64/linker.ld @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#include /* Seems not to get picked up automatically? */ + +#define _LINKER +#define _ASMLANGUAGE +#include + +/* The common-ram.ld definitions are written to a sort of oddball + * preprocessor API which we reimplement here. 
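+ * The stubs below collapse those macros into plain output-section
+ * statements that all land in the single :ram PT_LOAD segment.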
The default implementation + * is incompatible with the simpler way we handle address assignment + * and ELF segment definitions + */ +#define SORT_BY_NAME(x) SORT(x) +#define OPTIONAL +#define SECTION_DATA_PROLOGUE(name, opts, align) name opts : align +#define SECTION_PROLOGUE(name, opts, align) name opts : align +#define GROUP_DATA_LINK_IN(v, l) :ram +#define GROUP_LINK_IN(a) :ram + +PHDRS { + ram PT_LOAD; +} + +_start = 0x100000; +ENTRY(_start); + +SECTIONS { + . = 0x100000; + + text : { + KEEP(*(.xuk_stub32*)) + . = ALIGN(16); + KEEP(*(.xuk_start64*)) + + *(.text_start*) + *(.text*) + *(.gnu.linkonce.t.*) + *(.eh_frame) + *(.init) + *(.fini) + *(.eini) + KEEP(*(.openocd_dbg*)) + } :ram + +#include + + rodata : { + *(.rodata*) + *(.gnu.linkonce.r.*) + } :ram + + datas : { + *(.data*) + } :ram + +#include + + __bss_start = .; + bss (NOLOAD) : { + *(COMMON) + *(.bss*) + } :ram + __bss_end = .; + + noinit (NOLOAD) : { + *(.noinit*) + } :ram + + + /* We enable orphan section warnings, so these standard sections all + * have to be called out explicitly + */ + .comment 0 : { *(.comment) } + .debug_frame 0 : { *(.debug_frame) } + .debug_info 0 : { *(.debug_info) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_aranges 0 : { *(.debug_aranges) } + .debug_ranges 0 : { *(.debug_ranges) } + .debug_line 0 : { *(.debug_line) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .note.GNU-stack 0 : { *(.note.GNU-stack) } + .picjunk 0 : { *(.got*) *(.*plt*) *(.rela.*) } +} diff --git a/soc/x86_64/x86_64/soc.h b/soc/x86_64/x86_64/soc.h new file mode 100644 index 00000000000..94a09d408a6 --- /dev/null +++ b/soc/x86_64/x86_64/soc.h @@ -0,0 +1,8 @@ +/* + * Copyright (c) 2018 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +/* Empty file. There is no standard API to be defined here, yet some + * test code includes it. + */ diff --git a/tests/benchmarks/sys_kernel/testcase.yaml b/tests/benchmarks/sys_kernel/testcase.yaml index 37072d36107..36d7869216f 100644 --- a/tests/benchmarks/sys_kernel/testcase.yaml +++ b/tests/benchmarks/sys_kernel/testcase.yaml @@ -1,5 +1,5 @@ tests: benchmark.kernel: - arch_exclude: nios2 riscv32 xtensa + arch_exclude: nios2 riscv32 xtensa x86_64 min_ram: 32 tags: benchmark diff --git a/tests/include/test_asm_inline_gcc.h b/tests/include/test_asm_inline_gcc.h index 340c3fa14eb..521b58e4a67 100644 --- a/tests/include/test_asm_inline_gcc.h +++ b/tests/include/test_asm_inline_gcc.h @@ -23,6 +23,12 @@ static inline void timestamp_serialize(void) : : "%eax", "%ebx", "%ecx", "%edx"); } +#elif defined(CONFIG_X86_64) +static inline void timestamp_serialize(void) +{ + __asm__ volatile("xorq %%rax,%%rax; cpuid" + ::: "rax", "rdx", "rbx", "rcx"); +} #elif defined(CONFIG_CPU_CORTEX_M) #include static inline void timestamp_serialize(void) diff --git a/tests/kernel/fatal/src/main.c b/tests/kernel/fatal/src/main.c index e9aeaa00f2b..ffef259bab5 100644 --- a/tests/kernel/fatal/src/main.c +++ b/tests/kernel/fatal/src/main.c @@ -46,14 +46,15 @@ static volatile int crash_reason; * completing the exception path; the faulting thread is never run * again. * - * On Xtensa/asm2 the handler is running in interrupt context and on - * the interrupt stack and needs to return through the interrupt exit - * code. + * On Xtensa/asm2 and x86_64 the handler is running in interrupt + * context and on the interrupt stack and needs to return through the + * interrupt exit code. 
* * In both cases the thread is guaranteed never to run again once we * return from the _SysFatalErrorHandler(). */ -#if !(defined(CONFIG_ARM) || defined(CONFIG_XTENSA_ASM2) || defined(CONFIG_ARC)) +#if !(defined(CONFIG_ARM) || defined(CONFIG_XTENSA_ASM2) \ + || defined(CONFIG_ARC) || defined(CONFIG_X86_64)) #define ERR_IS_NORETURN 1 #endif @@ -73,7 +74,7 @@ void _SysFatalErrorHandler(unsigned int reason, const NANO_ESF *pEsf) void alt_thread1(void) { -#if defined(CONFIG_X86) +#if defined(CONFIG_X86) || defined(CONFIG_X86_64) __asm__ volatile ("ud2"); #elif defined(CONFIG_NIOS2) __asm__ volatile ("trap");