/* Simple program to layout "physical" memory for new lguest guest.
 * Linked high to avoid likely physical memory. */
#define _LARGEFILE64_SOURCE
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <err.h>
#include <stdint.h>
#include <stdlib.h>
#include <elf.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <stdbool.h>
#include <errno.h>
#include <ctype.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <time.h>
#include <netinet/in.h>
#include <net/if.h>
#include <linux/sockios.h>
#include <linux/if_tun.h>
#include <sys/uio.h>
#include <termios.h>
#include <getopt.h>
#include <zlib.h>
typedef unsigned long long u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
#include "../../include/linux/lguest_launcher.h"
#include "../../include/asm-i386/e820.h"

#define PAGE_PRESENT	0x7	/* Present, RW, Execute */
#define NET_PEERNUM	1
#define BRIDGE_PFX	"bridge:"
#ifndef SIOCBRADDIF
#define SIOCBRADDIF	0x89a2	/* add interface to bridge */
#endif

static bool verbose;
#define verbose(args...) \
	do { if (verbose) printf(args); } while(0)
static int waker_fd;

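/* Note: the Launcher keeps every device on a simple linked list, plus the set
 * of file descriptors it select()s on.  "lastdev" points at the tail's next
 * pointer so new devices are appended in registration order. */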
struct device_list
{
	fd_set infds;
	int max_infd;

	struct device *dev;
	struct device **lastdev;
};

struct device
{
	struct device *next;
	struct lguest_device_desc *desc;
	void *mem;

	/* Watch this fd if handle_input non-NULL. */
	int fd;
	bool (*handle_input)(int fd, struct device *me);

	/* Watch DMA to this key if handle_output non-NULL. */
	unsigned long watch_key;
	u32 (*handle_output)(int fd, const struct iovec *iov,
			     unsigned int num, struct device *me);

	/* Device-specific data. */
	void *priv;
};

static int open_or_die(const char *name, int flags)
{
	int fd = open(name, flags);
	if (fd < 0)
		err(1, "Failed to open %s", name);
	return fd;
}

static void *map_zeroed_pages(unsigned long addr, unsigned int num)
{
	static int fd = -1;

	/* /dev/zero provides an endless supply of zero bytes: you can read as
	 * many as you need, and the device always has more to give.  It can
	 * also be used to write a string of zeros over a device or file. */
	if (fd == -1)
		fd = open_or_die("/dev/zero", O_RDONLY);

	/* void *mmap(void *addr, size_t len, int prot, int flags, int fd, off_t offset);
	 *
	 * "fd" is the descriptor of the file to be mapped into the process's
	 * address space, usually returned by open().  fd may also be -1
	 * together with MAP_ANON in "flags" for an anonymous mapping: no file
	 * is involved, so nothing has to be created or opened, and such a
	 * mapping is obviously only useful for communication between related
	 * processes.
	 *
	 * "len" is the number of bytes mapped into the calling process's
	 * address space, counted from "offset" bytes into the mapped file.
	 *
	 * "prot" gives the access protection of the mapping: an OR of
	 * PROT_READ (readable), PROT_WRITE (writable), PROT_EXEC (executable)
	 * and PROT_NONE (inaccessible).
	 *
	 * "flags" is built from MAP_SHARED, MAP_PRIVATE and MAP_FIXED.  One of
	 * MAP_SHARED or MAP_PRIVATE must be chosen; MAP_FIXED is normally
	 * discouraged.  With MAP_SHARED, modifications to the mapped memory
	 * are carried through to the file; with MAP_PRIVATE they are visible
	 * only to this process and do not affect the file.
	 *
	 * "offset" is usually 0, meaning the mapping starts at the beginning
	 * of the file.
	 *
	 * "addr" is the requested start address of the mapping in the
	 * process's address space; it is usually NULL, leaving the choice to
	 * the kernel.  The return value is the address the file was actually
	 * mapped at, which the process can then use directly. */
	if (mmap((void *)addr, getpagesize() * num,
		 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0)
	    != (void *)addr)
		err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr);
	return (void *)addr;
}

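/* Note: the Guest kernel apparently places the magic string "GenuineLguest"
 * immediately before its lguest entry code, so we scan the loaded image for
 * it and treat the address just past the string (plus page_offset, to make it
 * a virtual address) as the entry point. */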
/* Find magic string marking entry point, return entry point. */
static unsigned long entry_point(void *start, void *end,
				 unsigned long page_offset)
{
	void *p;

	for (p = start; p < end; p++)
		if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0)
			return (long)p + strlen("GenuineLguest") + page_offset;

	err(1, "Is this image a genuine lguest?");
}

/* Returns the entry point */
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
			     unsigned long *page_offset)
{
	void *addr;
	Elf32_Phdr phdr[ehdr->e_phnum];
	unsigned int i;
	unsigned long start = -1UL, end = 0;

	/* Sanity checks. */
	if (ehdr->e_type != ET_EXEC
	    || ehdr->e_machine != EM_386
	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
		errx(1, "Malformed elf header");

	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
		err(1, "Seeking to program headers");
	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
		err(1, "Reading program headers");

	*page_offset = 0;
	/* We map the loadable segments at virtual addresses corresponding
	 * to their physical addresses (our virtual == guest physical). */
	for (i = 0; i < ehdr->e_phnum; i++) {
		if (phdr[i].p_type != PT_LOAD)
			continue;

		verbose("Section %i: size %i addr %p\n",
			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);

		/* We expect linear address space. */
		if (!*page_offset)
			*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
		else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
			errx(1, "Page offset of section %i different", i);

		if (phdr[i].p_paddr < start)
			start = phdr[i].p_paddr;
		if (phdr[i].p_paddr + phdr[i].p_filesz > end)
			end = phdr[i].p_paddr + phdr[i].p_filesz;

		/* We map everything private, writable. */
		addr = mmap((void *)phdr[i].p_paddr,
			    phdr[i].p_filesz,
			    PROT_READ|PROT_WRITE|PROT_EXEC,
			    MAP_FIXED|MAP_PRIVATE,
			    elf_fd, phdr[i].p_offset);
		if (addr != (void *)phdr[i].p_paddr)
			err(1, "Mmaping vmlinux seg %i gave %p not %p",
			    i, addr, (void *)phdr[i].p_paddr);
	}

	return entry_point((void *)start, (void *)end, *page_offset);
}

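/* Note: for a bzImage there are no ELF headers to tell us the kernel's
 * virtual base (PAGE_OFFSET, normally 0xC0000000).  The heuristic below scans
 * the unpacked image for absolute "mov memaddr,%eax" instructions (opcode
 * 0xA1); once several of them agree on the same top byte of the address,
 * that byte is taken to be the page offset. */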
/* This is amazingly reliable. */
static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
{
	unsigned int i, possibilities[256] = { 0 };

	for (i = 0; i + 4 < len; i++) {
		/* mov 0xXXXXXXXX,%eax */
		if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3)
			return (unsigned long)img[i+4] << 24;
	}
	errx(1, "could not determine page offset");
}

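/* Note: the payload of a bzImage is a gzip stream containing the real kernel.
 * We decompress it to the 1 MB mark (0x100000), where the kernel expects to
 * run, then locate the entry point inside the unpacked image. */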
static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
{
	gzFile f;
	int ret, len = 0;
	void *img = (void *)0x100000;

	f = gzdopen(fd, "rb");
	while ((ret = gzread(f, img + len, 65536)) > 0)
		len += ret;
	if (ret < 0)
		err(1, "reading image from bzImage");

	verbose("Unpacked size %i addr %p\n", len, img);
	*page_offset = intuit_page_offset(img, len);

	return entry_point(img, img + len, *page_offset);
}

static unsigned long load_bzimage(int fd, unsigned long *page_offset)
{
	unsigned char c;
	int state = 0;

	/* Ugly brute force search for gzip header. */
	while (read(fd, &c, 1) == 1) {
		switch (state) {
		case 0:
			if (c == 0x1F)
				state++;
			break;
		case 1:
			if (c == 0x8B)
				state++;
			else
				state = 0;
			break;
		case 2 ... 8:
			state++;
			break;
		case 9:
			lseek(fd, -10, SEEK_CUR);
			if (c != 0x03) /* Compressed under UNIX. */
				state = -1;
			else
				return unpack_bzimage(fd, page_offset);
		}
	}
	errx(1, "Could not find kernel in bzImage");
}

static unsigned long load_kernel(int fd, unsigned long *page_offset)
{
	Elf32_Ehdr hdr;

	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
		err(1, "Reading kernel");

	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
		return map_elf(fd, &hdr, page_offset);

	return load_bzimage(fd, page_offset);
}

static inline unsigned long page_align(unsigned long addr)
{
	return ((addr + getpagesize()-1) & ~(getpagesize()-1));
}

/* initrd gets loaded at top of memory: return length. */
static unsigned long load_initrd(const char *name, unsigned long mem)
{
	int ifd;
	struct stat st;
	unsigned long len;
	void *iaddr;

	ifd = open_or_die(name, O_RDONLY);
	if (fstat(ifd, &st) < 0)
		err(1, "fstat() on initrd '%s'", name);

	len = page_align(st.st_size);
	iaddr = mmap((void *)mem - len, st.st_size,
		     PROT_READ|PROT_EXEC|PROT_WRITE,
		     MAP_FIXED|MAP_PRIVATE, ifd, 0);
	if (iaddr != (void *)mem - len)
		err(1, "Mmaping initrd '%s' returned %p not %p",
		    name, iaddr, (void *)mem - len);
	close(ifd);
	verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr);
	return len;
}

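/* Note: the Guest starts life with paging enabled, so it needs an initial
 * page table that maps its physical memory linearly at page_offset.  We build
 * one top-level page (the pgdir) just below the initrd, and below that as
 * many PTE pages as it takes to cover the mapped memory. */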
static unsigned long setup_pagetables(unsigned long mem,
				      unsigned long initrd_size,
				      unsigned long page_offset)
{
	u32 *pgdir, *linear;
	unsigned int mapped_pages, i, linear_pages;
	unsigned int ptes_per_page = getpagesize()/sizeof(u32);

	/* If we can map all of memory above page_offset, we do so. */
	if (mem <= -page_offset)
		mapped_pages = mem/getpagesize();
	else
		mapped_pages = -page_offset/getpagesize();

	/* Each linear PTE page can map ptes_per_page pages. */
	linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;

	/* We lay out top-level then linear mapping immediately below initrd */
	pgdir = (void *)mem - initrd_size - getpagesize();
	linear = (void *)pgdir - linear_pages*getpagesize();

	for (i = 0; i < mapped_pages; i++)
		linear[i] = ((i * getpagesize()) | PAGE_PRESENT);

	/* Now set up pgd so that this memory is at page_offset */
	for (i = 0; i < mapped_pages; i += ptes_per_page) {
		pgdir[(i + page_offset/getpagesize())/ptes_per_page]
			= (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
	}

	verbose("Linear mapping of %u pages in %u pte pages at %p\n",
		mapped_pages, linear_pages, linear);

	return (unsigned long)pgdir;
}

static void concat(char *dst, char *args[])
{
	unsigned int i, len = 0;

	for (i = 0; args[i]; i++) {
		strcpy(dst+len, args[i]);
		strcat(dst+len, " ");
		len += strlen(args[i]) + 1;
	}
	/* In case it's empty. */
	dst[len] = '\0';
}

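/* Note: this is where the Guest is actually created.  We open /dev/lguest and
 * write an LHREQ_INITIALIZE request giving the top page frame number, the
 * initial page directory, the start address and the page offset; the fd we
 * get back is what we later use to run the Guest and to send it interrupts
 * and DMA. */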
static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
{
	u32 args[] = { LHREQ_INITIALIZE,
		       LGUEST_GUEST_TOP/getpagesize(), /* Just below us */
		       pgdir, start, page_offset };
	int fd;

	fd = open_or_die("/dev/lguest", O_RDWR);
	if (write(fd, args, sizeof(args)) < 0)
		err(1, "Writing to /dev/lguest");
	return fd;
}

static void set_fd(int fd, struct device_list *devices)
{
	FD_SET(fd, &devices->infds);
	if (fd > devices->max_infd)
		devices->max_infd = fd;
}

/* When input arrives, we tell the kernel to kick lguest out with -EAGAIN. */
static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices)
{
	set_fd(pipefd, devices);

	for (;;) {
		fd_set rfds = devices->infds;
		u32 args[] = { LHREQ_BREAK, 1 };

		/* The select() system call lets a process wait on several I/O
		 * devices at once: it blocks while none of them is ready, and
		 * returns as soon as any one of them becomes ready. */
		select(devices->max_infd+1, &rfds, NULL, NULL, NULL);
		if (FD_ISSET(pipefd, &rfds)) {
			int ignorefd;
			if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0)
				exit(0);
			FD_CLR(ignorefd, &devices->infds);
		} else
			write(lguest_fd, args, sizeof(args));
	}
}

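/* Note: setup_waker() forks off the "Waker" child, which runs wake_parent()
 * above: it select()s on every device fd and, when one becomes readable,
 * writes LHREQ_BREAK to kick the Guest back out to the Launcher.  The pipe is
 * how the parent tells the Waker to stop watching an fd (or, on EOF, to
 * exit). */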
static int setup_waker(int lguest_fd, struct device_list *device_list)
{
	int pipefd[2], child;

	pipe(pipefd);
	child = fork();
	if (child == -1)
		err(1, "forking");

	if (child == 0) {
		close(pipefd[1]);
		wake_parent(pipefd[0], lguest_fd, device_list);
	}
	close(pipefd[0]);

	return pipefd[1];
}

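/* Note: any address the Guest hands us has to be treated as untrusted.
 * check_pointer() makes sure it lies below LGUEST_GUEST_TOP before we
 * dereference it, so a buggy or malicious Guest cannot make the Launcher
 * touch its own memory. */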
static void *_check_pointer(unsigned long addr, unsigned int size,
			    unsigned int line)
{
	if (addr >= LGUEST_GUEST_TOP || addr + size >= LGUEST_GUEST_TOP)
		errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr);
	return (void *)addr;
}
#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)

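/* Note: lguest I/O is built on "DMA" descriptors: a struct lguest_dma in
 * Guest memory holds up to LGUEST_MAX_DMA_SECTIONS address/length pairs plus
 * a used_len field.  dma2iov() turns those pairs into an iovec we can hand to
 * readv()/writev(), checking every pointer along the way. */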
/* Returns pointer to dma->used_len */
static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
{
	unsigned int i;
	struct lguest_dma *udma;

	udma = check_pointer(dma, sizeof(*udma));
	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
		if (!udma->len[i])
			break;

		//struct iovec
		//{
		//	void __user *iov_base;	/* BSD uses caddr_t (1003.1g requires void *) */
		//	__kernel_size_t iov_len; /* Must be size_t (1003.1g) */
		//};
		iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]);
		iov[i].iov_len = udma->len[i];
	}
	*num = i;
	return &udma->used_len;
}

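/* Note: before we can push data into the Guest we need a receive buffer the
 * Guest has bound to this key.  Writing LHREQ_GETDMA asks the host module for
 * one; write() returns the Guest address of the struct lguest_dma, or -1 if
 * the Guest has none pending.  The host stashes the interrupt number to
 * trigger in the used_len field. */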
static u32 *get_dma_buffer(int fd, void *key,
			   struct iovec iov[], unsigned int *num, u32 *irq)
{
	u32 buf[] = { LHREQ_GETDMA, (u32)key };
	unsigned long udma;
	u32 *res;

	udma = write(fd, buf, sizeof(buf));
	if (udma == (unsigned long)-1)
		return NULL;

	/* Kernel stashes irq in ->used_len. */
	res = dma2iov(udma, iov, num);
	*irq = *res;
	return res;
}

static void trigger_irq(int fd, u32 irq)
{
	u32 buf[] = { LHREQ_IRQ, irq };
	if (write(fd, buf, sizeof(buf)) != 0)
		err(1, "Triggering irq %i", irq);
}

static void discard_iovec(struct iovec *iov, unsigned int *num)
{
	static char discard_buf[1024];
	*num = 1;
	iov->iov_base = discard_buf;
	iov->iov_len = sizeof(discard_buf);
}

static struct termios orig_term;
static void restore_term(void)
{
	/* tcsetattr() sets terminal attributes: here it restores the terminal
	 * settings saved in orig_term. */
	tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
}

struct console_abort
{
	int count;
	// struct timeval {
	//	time_t		tv_sec;		/* seconds */
	//	suseconds_t	tv_usec;	/* microseconds */
	// };
	struct timeval start;
};

/* We DMA input to buffer bound at start of console page. */
static bool handle_console_input(int fd, struct device *dev)
{
	u32 irq = 0, *lenp;
	int len;
	unsigned int num;
	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
	struct console_abort *abort = dev->priv;

	lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq);
	if (!lenp) {
		warn("console: no dma buffer!");
		discard_iovec(iov, &num);
	}

	/* A readv() call reads the indicated amount into each buffer in
	 * turn. */
	len = readv(dev->fd, iov, num);
	if (len <= 0) {
		warnx("Failed to get console input, ignoring console.");
		len = 0;
	}

	if (lenp) {
		*lenp = len;
		trigger_irq(fd, irq);
	}

	/* Three ^C within one second?  Exit. */
	if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) {
		if (!abort->count++)
			gettimeofday(&abort->start, NULL);
		else if (abort->count == 3) {
			struct timeval now;
			gettimeofday(&now, NULL);
			if (now.tv_sec <= abort->start.tv_sec+1) {
				/* Make sure waker is not blocked in BREAK */
				u32 args[] = { LHREQ_BREAK, 0 };
				close(waker_fd);
				write(fd, args, sizeof(args));
				exit(2);
			}
			abort->count = 0;
		}
	} else
		abort->count = 0;

	if (!len) {
		restore_term();
		return false;
	}
	return true;
}

static u32 handle_console_output(int fd, const struct iovec *iov,
				 unsigned num, struct device *dev)
{
	/* writev() gathers the contents of all the buffers and sends them out
	 * as a single write operation. */
	return writev(STDOUT_FILENO, iov, num);
}

static u32 handle_tun_output(int fd, const struct iovec *iov,
			     unsigned num, struct device *dev)
{
	/* Now we've seen output, we should warn if we can't get buffers. */
	*(bool *)dev->priv = true;
	return writev(dev->fd, iov, num);
}

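/* Note: the network device page appears to hold one 4-byte DMA key slot per
 * peer; peer_offset(peernum) is the offset of that peer's slot.
 * handle_tun_input() below fetches the Guest's registered receive buffers via
 * the slot for NET_PEERNUM and copies incoming tun/tap frames into them. */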
static unsigned long peer_offset(unsigned int peernum)
{
	return 4 * peernum;
}

static bool handle_tun_input(int fd, struct device *dev)
{
	u32 irq = 0, *lenp;
	int len;
	unsigned num;
	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];

	lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num,
			      &irq);
	if (!lenp) {
		if (*(bool *)dev->priv)
			warn("network: no dma buffer!");
		discard_iovec(iov, &num);
	}

	len = readv(dev->fd, iov, num);
	if (len <= 0)
		err(1, "reading network");
	if (lenp) {
		*lenp = len;
		trigger_irq(fd, irq);
	}
	verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
		((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1],
		lenp ? "sent" : "discarded");
	return true;
}

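/* Note: the block device's shared page is a struct lguest_block_page: the
 * Guest fills in type (0 = read, non-zero = write), sector and bytes, then
 * DMAs to the key at the start of the page.  We seek to sector*512 in the
 * backing file, do the readv()/writev(), set result (1 on success, 2 when
 * the length didn't match) and trigger the interrupt. */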
static u32 handle_block_output(int fd, const struct iovec *iov,
			       unsigned num, struct device *dev)
{
	struct lguest_block_page *p = dev->mem;
	u32 irq, *lenp;
	unsigned int len, reply_num;
	struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
	off64_t device_len, off = (off64_t)p->sector * 512;

	device_len = *(off64_t *)dev->priv;

	if (off >= device_len)
		err(1, "Bad offset %llu vs %llu", off, device_len);
	if (lseek64(dev->fd, off, SEEK_SET) != off)
		err(1, "Bad seek to sector %i", p->sector);

	verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);

	lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
	if (!lenp)
		err(1, "Block request didn't give us a dma buffer");

	if (p->type) {
		len = writev(dev->fd, iov, num);
		if (off + len > device_len) {
			ftruncate(dev->fd, device_len);
			errx(1, "Write past end %llu+%u", off, len);
		}
		*lenp = 0;
	} else {
		len = readv(dev->fd, reply, reply_num);
		*lenp = len;
	}

	p->result = 1 + (p->bytes != len);
	trigger_irq(fd, irq);
	return 0;
}
