Chinaunix首页 | 论坛 | 博客
  • 博客访问: 1326542
  • 博文数量: 179
  • 博客积分: 4141
  • 博客等级: 中将
  • 技术积分: 2083
  • 用 户 组: 普通用户
  • 注册时间: 2009-03-21 20:04
文章存档

2024年(1)

2019年(13)

2016年(1)

2014年(16)

2011年(8)

2010年(25)

2009年(115)

分类: 虚拟化

2009-03-22 15:59:02


/*P:010
 * A hypervisor allows multiple Operating Systems to run on a single machine.
 * To quote David Wheeler: "Any problem in computer science can be solved with
 * another layer of indirection."
 *
 * We keep things simple in two ways. First, we start with a normal Linux
 * kernel and insert a module (lg.ko) which allows us to run other Linux
 * kernels the same way we'd run processes. We call the first kernel the Host,
 * and the others the Guests. The program which sets up and configures Guests
 * (such as the example in Documentation/lguest/lguest.c) is called the
 * Launcher.
 *
 * Secondly, we only run specially modified Guests, not normal kernels. When
 * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets
 * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows
 * how to be a Guest. This means that you can use the same kernel you boot
 * normally (ie. as a Host) as a Guest.
 *
 * These Guests know that they cannot do privileged operations, such as disable
 * interrupts, and that they have to ask the Host to do such things explicitly.
 * This file consists of all the replacements for such low-level native
 * hardware operations: these special Guest versions call the Host.
 *
 * So how does the kernel know it's a Guest? The Guest starts at a special
 * entry point marked with a magic string, which sets up a few things then
 * calls here. We replace the native functions in "struct paravirt_ops"
 * with our Guest versions, then boot like normal. :*/


1 /*

2 * Lguest specific paravirt-ops implementation

3 *

4 * Copyright (C) 2006, Rusty Russell IBM Corporation.

5 *

6 * This program is free software; you can redistribute it and/or modify

7 * it under the terms of the GNU General Public License as published by

8 * the Free Software Foundation; either version 2 of the License, or

9 * (at your option) any later version.

10 *

11 * This program is distributed in the hope that it will be useful, but

12 * WITHOUT ANY WARRANTY; without even the implied warranty of

13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or

14 * NON INFRINGEMENT. See the GNU General Public License for more

15 * details.

16 *

17 * You should have received a copy of the GNU General Public License

18 * along with this program; if not, write to the Free Software

19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

20 */


21 #include <linux/kernel.h>

22 #include <linux/start_kernel.h>

23 #include <linux/string.h>

24 #include <linux/console.h>

25 #include <linux/screen_info.h>

26 #include <linux/irq.h>

27 #include <linux/interrupt.h>

28 #include <linux/lguest.h>

29 #include <linux/lguest_launcher.h>

30 #include <linux/lguest_bus.h>

31 #include <asm/paravirt.h>

32 #include <asm/param.h>

33 #include <asm/page.h>

34 #include <asm/pgtable.h>

35 #include <asm/desc.h>

36 #include <asm/setup.h>

37 #include <asm/e820.h>

38 #include <asm/mce.h>

39 #include <asm/io.h>

40

41 /* Declarations for definitions in lguest_guest.S */

42 extern char lguest_noirq_start[], lguest_noirq_end[];

43 extern const char lgstart_cli[], lgend_cli[];

44 extern const char lgstart_sti[], lgend_sti[];

45 extern const char lgstart_popf[], lgend_popf[];

46 extern const char lgstart_pushf[], lgend_pushf[];

47 extern const char lgstart_iret[], lgend_iret[];

48 extern void lguest_iret(void);

49

50 struct lguest_data lguest_data = {

51 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },

52 .noirq_start = (u32)lguest_noirq_start,

53 .noirq_end = (u32)lguest_noirq_end,

54 .blocked_interrupts = { 1 }, /* Block timer interrupts */

55 };

56 struct lguest_device_desc *lguest_devices;

57 static __initdata const struct lguest_boot_info *boot = __va(0);

58

59 static enum paravirt_lazy_mode lazy_mode;

60 static void lguest_lazy_mode(enum paravirt_lazy_mode mode)

61 {

62 if (mode == PARAVIRT_LAZY_FLUSH) {

63 if (unlikely(lazy_mode != PARAVIRT_LAZY_NONE))

64 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);

65 } else {

66 lazy_mode = mode;

67 if (mode == PARAVIRT_LAZY_NONE)

68 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);

69 }

70 }

71

72 static void lazy_hcall(unsigned long call,

73 unsigned long arg1,

74 unsigned long arg2,

75 unsigned long arg3)

76 {

77 if (lazy_mode == PARAVIRT_LAZY_NONE)

78 hcall(call, arg1, arg2, arg3);

79 else

80 async_hcall(call, arg1, arg2, arg3);

81 }

82

83 void async_hcall(unsigned long call,

84 unsigned long arg1, unsigned long arg2, unsigned long arg3)

85 {

86 /* Note: This code assumes we're uniprocessor. */

87 static unsigned int next_call;

88 unsigned long flags;

89

90 local_irq_save(flags);

91 if (lguest_data.hcall_status[next_call] != 0xFF) {

92 /* Table full, so do normal hcall which will flush table. */

93 hcall(call, arg1, arg2, arg3);

94 } else {

95 lguest_data.hcalls[next_call].eax = call;

96 lguest_data.hcalls[next_call].edx = arg1;

97 lguest_data.hcalls[next_call].ebx = arg2;

98 lguest_data.hcalls[next_call].ecx = arg3;

99 /* Make sure host sees arguments before "valid" flag. */

100 wmb();

101 lguest_data.hcall_status[next_call] = 0;

102 if (++next_call == LHCALL_RING_SIZE)

103 next_call = 0;

104 }

105 local_irq_restore(flags);

106 }

107

108 void lguest_send_dma(unsigned long key, struct lguest_dma *dma)

109 {

110 dma->used_len = 0;

111 hcall(LHCALL_SEND_DMA, key, __pa(dma), 0);

112 }

113

114 int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas,

115 unsigned int num, u8 irq)

116 {

117 if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq))

118 return -ENOMEM;

119 return 0;

120 }

121

122 void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas)

123 {

124 hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0);

125 }

126

127 /* For guests, device memory can be used as normal memory, so we cast away the

128 * __iomem to quieten sparse. */


/**

 * ioremap_nocache - map bus memory into CPU space

 * @offset: bus address of the memory

 * @size: size of the resource to map

 */


129 void *lguest_map(unsigned long phys_addr, unsigned long pages)

130 {

131 return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages);

132 }

133

134 void lguest_unmap(void *addr)

135 {

136 iounmap((__force void __iomem *)addr);

137 }

138

139 static unsigned long save_fl(void)

140 {

141 return lguest_data.irq_enabled;

142 }

143

144 static void restore_fl(unsigned long flags)

145 {

146 /* FIXME: Check if interrupt pending... */

147 lguest_data.irq_enabled = flags;

148 }

149

150 static void irq_disable(void)

151 {

152 lguest_data.irq_enabled = 0;

153 }

154

155 static void irq_enable(void)

156 {

157 /* FIXME: Check if interrupt pending... */

158 lguest_data.irq_enabled = X86_EFLAGS_IF;

159 }

160

161 static void lguest_write_idt_entry(struct desc_struct *dt,

162 int entrynum, u32 low, u32 high)

163 {

164 write_dt_entry(dt, entrynum, low, high);

165 hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);

166 }

167

168 static void lguest_load_idt(const struct Xgt_desc_struct *desc)

169 {

170 unsigned int i;

171 struct desc_struct *idt = (void *)desc->address;

172

173 for (i = 0; i < (desc->size+1)/8; i++)

174 hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);

175 }

176

177 static void lguest_load_gdt(const struct Xgt_desc_struct *desc)

178 {

179 BUG_ON((desc->size+1)/8 != GDT_ENTRIES);

180 hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);

181 }

182

183 static void lguest_write_gdt_entry(struct desc_struct *dt,

184 int entrynum, u32 low, u32 high)

185 {

186 write_dt_entry(dt, entrynum, low, high);

187 hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);

188 }

189

190 static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)

191 {

192 lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);

193 }

194

/* set_ldt replacement: deliberately does nothing with its arguments. */
static void lguest_set_ldt(const void *addr, unsigned entries)
{
	(void)addr;
	(void)entries;
}

198

/* load_tr_desc replacement: a no-op. */
static void lguest_load_tr_desc(void)
{
	return;
}

202

/*
 * cpuid replacement: run the real CPUID, then filter what the Guest kernel
 * is allowed to see.
 *
 * @eax: in: the CPUID leaf requested; out: the (possibly filtered) result.
 * @ebx, @ecx, @edx: passed through to native_cpuid(); ecx/edx are masked
 *                   for leaf 1.
 *
 * The leaf is kept in an unsigned variable: 0x80000000 does not fit in a
 * signed int, so switching on a signed copy relied on implementation-defined
 * conversion of the case label.
 */
static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
			 unsigned int *ecx, unsigned int *edx)
{
	/* Remember the leaf: native_cpuid() overwrites *eax with the result. */
	unsigned int function = *eax;

	native_cpuid(eax, ebx, ecx, edx);
	switch (function) {
	case 1:	/* Basic feature request. */
		/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
		*ecx &= 0x00002201;
		/* Similarly: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
		*edx &= 0x07808101;
		/* Host wants to know when we flush kernel pages: set PGE. */
		*edx |= 0x00002000;
		break;
	case 0x80000000:
		/* Futureproof this a little: if they ask how much extended
		 * processor information, limit it to known fields. */
		if (*eax > 0x80000008)
			*eax = 0x80000008;
		break;
	}
}

226

227 static unsigned long current_cr0, current_cr3;

228 static void lguest_write_cr0(unsigned long val)

229 {

230 lazy_hcall(LHCALL_TS, val & 8, 0, 0);

231 current_cr0 = val;

232 }

233

234 static unsigned long lguest_read_cr0(void)

235 {

236 return current_cr0;

237 }

238

239 static void lguest_clts(void)

240 {

241 lazy_hcall(LHCALL_TS, 0, 0, 0);

242 current_cr0 &= ~8U;

243 }

244

245 static unsigned long lguest_read_cr2(void)

246 {

247 return lguest_data.cr2;

248 }

249

250 static void lguest_write_cr3(unsigned long cr3)

251 {

252 lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);

253 current_cr3 = cr3;

254 }

255

256 static unsigned long lguest_read_cr3(void)

257 {

258 return current_cr3;

259 }

260

/*
 * cr4 is used to enable/disable PGE, but we don't care: reads always
 * return 0.
 */
static unsigned long lguest_read_cr4(void)
{
	const unsigned long cr4 = 0;

	return cr4;
}

266

/* write_cr4 replacement: the value is discarded. */
static void lguest_write_cr4(unsigned long val)
{
	(void)val;
}

270

271 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,

272 pte_t *ptep, pte_t pteval)

273 {

274 *ptep = pteval;

275 lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);

276 }

277

278 /* We only support two-level pagetables at the moment. */

279 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)

280 {

281 *pmdp = pmdval;

282 lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK,

283 (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);

284 }

285

286 /* FIXME: Eliminate all callers of this. */

287 static void lguest_set_pte(pte_t *ptep, pte_t pteval)

288 {

289 *ptep = pteval;

290 /* Don't bother with hypercall before initial setup. */

291 if (current_cr3)

292 lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);

293 }

294

295 static void lguest_flush_tlb_single(unsigned long addr)

296 {

297 /* Simply set it to zero, and it will fault back in. */

298 lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0);

299 }

300

301 static void lguest_flush_tlb_user(void)

302 {

303 lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);

304 }

305

306 static void lguest_flush_tlb_kernel(void)

307 {

308 lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);

309 }

310

311 static void disable_lguest_irq(unsigned int irq)

312 {

313 set_bit(irq, lguest_data.blocked_interrupts);

314 }

315

316 static void enable_lguest_irq(unsigned int irq)

317 {

318 clear_bit(irq, lguest_data.blocked_interrupts);

319 /* FIXME: If it's pending? */

320 }

321

322 static struct irq_chip lguest_irq_controller = {

323 .name = "lguest",

324 .mask = disable_lguest_irq,

325 .mask_ack = disable_lguest_irq,

326 .unmask = enable_lguest_irq,

327 };

328

329 static void __init lguest_init_IRQ(void)

330 {

331 unsigned int i;

332

333 for (i = 0; i < LGUEST_IRQS; i++) {

334 int vector = FIRST_EXTERNAL_VECTOR + i;

335 if (vector != SYSCALL_VECTOR) {

336 set_intr_gate(vector, interrupt[i]);

337 set_irq_chip_and_handler(i, &lguest_irq_controller,

338 handle_level_irq);

339 }

340 }

341 irq_ctx_init(smp_processor_id());

342 }

343

344 static unsigned long lguest_get_wallclock(void)

345 {

346 return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0);

347 }

348

349 static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)

350 {

351 do_timer(hcall(LHCALL_TIMER_READ, 0, 0, 0));

352 update_process_times(user_mode_vm(get_irq_regs()));

353 }

354

355 static u64 sched_clock_base;

356 static void lguest_time_init(void)

357 {

358 set_irq_handler(0, lguest_time_irq);

359 hcall(LHCALL_TIMER_READ, 0, 0, 0);

360 sched_clock_base = jiffies_64;

361 enable_lguest_irq(0);

362 }

363

364 static unsigned long long lguest_sched_clock(void)

365 {

366 return (jiffies_64 - sched_clock_base) * (1000000000 / HZ);

367 }

368

369 static void lguest_load_esp0(struct tss_struct *tss,

370 struct thread_struct *thread)

371 {

372 lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0,

373 THREAD_SIZE/PAGE_SIZE);

374 }

375

/* set_debugreg replacement: currently ignores both arguments.
 * FIXME: Implement */
static void lguest_set_debugreg(int regno, unsigned long value)
{
	(void)regno;
	(void)value;
}

380

/* wbinvd replacement: a no-op. */
static void lguest_wbinvd(void)
{
	return;
}

384

#ifdef CONFIG_X86_LOCAL_APIC
/* apic_write / apic_write_atomic replacement: the write is dropped. */
static void lguest_apic_write(unsigned long reg, unsigned long v)
{
	(void)reg;
	(void)v;
}

/* apic_read replacement: every register reads as 0. */
static unsigned long lguest_apic_read(unsigned long reg)
{
	(void)reg;
	return 0;
}
#endif

395

396 static void lguest_safe_halt(void)

397 {

398 hcall(LHCALL_HALT, 0, 0, 0);

399 }

400

401 static void lguest_power_off(void)

402 {

403 hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);

404 }

405

406 static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)

407 {

408 hcall(LHCALL_CRASH, __pa(p), 0, 0);

409 return NOTIFY_DONE;

410 }

411

412 static struct notifier_block paniced = {

413 .notifier_call = lguest_panic

414 };

415

416 static __init char *lguest_memory_setup(void)

417 {

418 /* We do this here because lockcheck barfs if before start_kernel */

419 atomic_notifier_chain_register(&panic_notifier_list, &paniced);

420

421 e820.nr_map = 0;

422 add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM);

423 return "LGUEST";

424 }

425

426 static const struct lguest_insns

427 {

428 const char *start, *end;

429 } lguest_insns[] = {

430 [PARAVIRT_PATCH(irq_disable)] = { lgstart_cli, lgend_cli },

431 [PARAVIRT_PATCH(irq_enable)] = { lgstart_sti, lgend_sti },

432 [PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf },

433 [PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf },

434 };

435 static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len)

436 {

437 unsigned int insn_len;

438

439 /* Don't touch it if we don't have a replacement */

440 if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)

441 return paravirt_patch_default(type, clobber, insns, len);

442

443 insn_len = lguest_insns[type].end - lguest_insns[type].start;

444

445 /* Similarly if we can't fit replacement. */

446 if (len < insn_len)

447 return paravirt_patch_default(type, clobber, insns, len);

448

449 memcpy(insns, lguest_insns[type].start, insn_len);

450 return insn_len;

451 }

452

453 __init void lguest_init(void)

454 {

455 paravirt_ops.name = "lguest";

456 paravirt_ops.paravirt_enabled = 1;

457 paravirt_ops.kernel_rpl = 1;

458

459 paravirt_ops.save_fl = save_fl;

460 paravirt_ops.restore_fl = restore_fl;

461 paravirt_ops.irq_disable = irq_disable;

462 paravirt_ops.irq_enable = irq_enable;

463 paravirt_ops.load_gdt = lguest_load_gdt;

464 paravirt_ops.memory_setup = lguest_memory_setup;

465 paravirt_ops.cpuid = lguest_cpuid;

466 paravirt_ops.write_cr3 = lguest_write_cr3;

467 paravirt_ops.flush_tlb_user = lguest_flush_tlb_user;

468 paravirt_ops.flush_tlb_single = lguest_flush_tlb_single;

469 paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;

470 paravirt_ops.set_pte = lguest_set_pte;

471 paravirt_ops.set_pte_at = lguest_set_pte_at;

472 paravirt_ops.set_pmd = lguest_set_pmd;

473 #ifdef CONFIG_X86_LOCAL_APIC

474 paravirt_ops.apic_write = lguest_apic_write;

475 paravirt_ops.apic_write_atomic = lguest_apic_write;

476 paravirt_ops.apic_read = lguest_apic_read;

477 #endif

478 paravirt_ops.load_idt = lguest_load_idt;

479 paravirt_ops.iret = lguest_iret;

480 paravirt_ops.load_esp0 = lguest_load_esp0;

481 paravirt_ops.load_tr_desc = lguest_load_tr_desc;

482 paravirt_ops.set_ldt = lguest_set_ldt;

483 paravirt_ops.load_tls = lguest_load_tls;

484 paravirt_ops.set_debugreg = lguest_set_debugreg;

485 paravirt_ops.clts = lguest_clts;

486 paravirt_ops.read_cr0 = lguest_read_cr0;

487 paravirt_ops.write_cr0 = lguest_write_cr0;

488 paravirt_ops.init_IRQ = lguest_init_IRQ;

489 paravirt_ops.read_cr2 = lguest_read_cr2;

490 paravirt_ops.read_cr3 = lguest_read_cr3;

491 paravirt_ops.read_cr4 = lguest_read_cr4;

492 paravirt_ops.write_cr4 = lguest_write_cr4;

493 paravirt_ops.write_gdt_entry = lguest_write_gdt_entry;

494 paravirt_ops.write_idt_entry = lguest_write_idt_entry;

495 paravirt_ops.patch = lguest_patch;

496 paravirt_ops.safe_halt = lguest_safe_halt;

497 paravirt_ops.get_wallclock = lguest_get_wallclock;

498 paravirt_ops.time_init = lguest_time_init;

499 paravirt_ops.set_lazy_mode = lguest_lazy_mode;

500 paravirt_ops.wbinvd = lguest_wbinvd;

501 paravirt_ops.sched_clock = lguest_sched_clock;

502

503 hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);

504 strncpy(boot_command_line, boot->cmdline, COMMAND_LINE_SIZE);

505

506 /* We use top of mem for initial pagetables. */

507 init_pg_tables_end = __pa(pg0);

508

509 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");

510

511 reserve_top_address(lguest_data.reserve_mem);

512

513 lockdep_init();

514

515 paravirt_disable_iospace();

516

517 cpu_detect(&new_cpu_data);

518 /* head.S usually sets up the first capability word, so do it here. */

519 new_cpu_data.x86_capability[0] = cpuid_edx(1);

520

521 /* Math is always hard! */

522 new_cpu_data.hard_math = 1;

523

524 #ifdef CONFIG_X86_MCE

525 mce_disabled = 1;

526 #endif

527

528 #ifdef CONFIG_ACPI

529 acpi_disabled = 1;

530 acpi_ht = 0;

531 #endif

532

533 add_preferred_console("hvc", 0, NULL);

534

535 if (boot->initrd_size) {

536 /* We stash this at top of memory. */

537 INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size;

538 INITRD_SIZE = boot->initrd_size;

539 LOADER_TYPE = 0xFF;

540 }

541

542 pm_power_off = lguest_power_off;

543 start_kernel();

544 }

 

阅读(2230) | 评论(0) | 转发(2) |
给主人留下些什么吧!~~