Chinaunix首页 | 论坛 | 博客
  • 博客访问: 135528
  • 博文数量: 30
  • 博客积分: 0
  • 博客等级: 民兵
  • 技术积分: 550
  • 用 户 组: 普通用户
  • 注册时间: 2013-09-08 20:28
个人简介

永远不要放弃学习,放弃学习就是放弃了勇敢生活下去的动力!

文章分类

全部博文(30)

文章存档

2014年(30)

我的朋友

分类: LINUX

2014-05-03 19:19:44

原文地址:linux c网页爬虫应用 作者:qiyuefeng11

程序要求:通过 socket 函数编程,实现 HTTP GET 功能。对返回的数据(页面的源代码)进行解析,提取出该页面中的超链接,并存放至文件中。
程序如下:

点击(此处)折叠或打开

  #include <errno.h>
  #include <signal.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <time.h>
  #include <unistd.h>

  #include <sys/types.h>
  #include <sys/select.h>
  #include <sys/socket.h>
  #include <sys/time.h>
  #include <sys/wait.h>

  #include <arpa/inet.h>
  #include <netdb.h>
  #include <netinet/in.h>
  #define PORT 80
  #define BUFSIZE 8184

  /* Stream that receives the extracted links, one per line; main() opens it. */
  static FILE *frontier;

  /*
   * parse - scan a NUL-terminated chunk of HTML for absolute http links and
   * append each one to the frontier stream, one link per line.
   *
   * Only anchors spelled exactly  a href="http://  are recognised.  The text
   * between "http://" and the closing double quote is what gets written, so
   * the scheme itself is not stored.  Anchors whose closing quote is not in
   * this chunk, or whose target is empty, are skipped.
   *
   * buf: NUL-terminated text to scan; it is not modified.  A NULL buf, or a
   *      frontier stream that has not been opened, makes the call a no-op.
   */
  void parse(char *buf)
  {
      static const char prefix[] = "a href=\"http://";
      const size_t prefix_len = sizeof prefix - 1;
      const char *hit;

      if (buf == NULL || frontier == NULL) {
          return;
      }

      hit = buf;
      while ((hit = strstr(hit, prefix)) != NULL) {
          /*
           * Matching the whole prefix (including "//") guarantees that the
           * link starts inside the string, so the length computed below can
           * never go negative and wrap around as a size_t.
           */
          const char *link = hit + prefix_len;
          const char *end = strchr(link, '"');

          if (end == NULL) {
              break;      /* closing quote is not part of this chunk */
          }
          if (end > link) {
              fwrite(link, 1, (size_t)(end - link), frontier);
              putc('\n', frontier);
          }
          hit = end;      /* resume the search after this link */
      }
      fflush(frontier);
  }
  33. //函数封装
  34. int httpget(char *url)
  35. {
  36.     
  37.      FILE *fp;
  38.      char *host_id;
  39.      
  40.      struct hostent *host;
  41.      int sockfd, ret, i, h;
  42.      struct sockaddr_in servaddr;
  43.      char str1[4096],buf[8184],*str;
  44.      socklen_t len;
  45.      fd_set t_set1;
  46.      struct timeval tv;
  47.     
  48.     
  49.     if((host = gethostbyname(url)) == NULL)
  50.      {
  51.          
  52.          printf("gethostbyname error");
  53.          exit(-1);
  54.      
  55.      }

  56.     host_id = inet_ntoa(*((struct in_addr*)host -> h_addr));
  57.     printf("ip adress %sn",host_id);
  58.     
  59.     
  60.      if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0 ) {

  61.               printf("socket error!n");
  62.               exit(0);
  63.         }

  64.         bzero(&servaddr, sizeof(servaddr));
  65.         servaddr.sin_family = AF_INET;
  66.         servaddr.sin_port = htons(PORT);

  67.       if (inet_pton(AF_INET,host_id,&servaddr.sin_addr) <= 0 ) {
  68.      
  69.           printf("inet_pton error!n");
  70.           exit(0);
  71.       }
  72.      
  73.         if (connect(sockfd, (struct sockaddr *)&servaddr, sizeof(servaddr)) < 0)
  74.        {     
  75.                 printf("connect error!n");
  76.                 exit(0);
  77.        }
  78.                 printf("connect success n");

  79.        memset(str1, 0, 4096);
  80.      
  81.      //初始get请求信息
  82.        strcat(str1, "GET / HTTP/1.0rn");
  83.        strcat(str1, "Accept: */*rn");
  84.        strcat(str1, "Accept-Language: zh-CNrn");
  85.        strcat(str1, "User-Agent: Mozilla/4.0rn");
  86.        sprintf(str1,"HOST: %srn",url);
  87.        strcat(str1,"Connection: Keep-Alivern");
  88.        strcat(str1, "rnrn");
  89.        printf("%sn",str1);

  90.        ret = send(sockfd,(void *)str1,strlen(str1),0);
  91.        if (ret < 0) {
  92.                 printf("send error %d,Error message'%s'n",errno, strerror(errno));
  93.                 exit(0);

  94.         }else{

  95.                 printf("send success ,total send %d n", ret);
  96.         }


  97.      while(1){
  98.    
  99.                 sleep(2);
  100.                 printf("******n");
  101.                 tv.tv_sec= 0;
  102.                 tv.tv_usec= 0;

  103.                 h= 0;
  104.                 
  105.                       FD_ZERO(&t_set1);
  106.                 FD_SET(sockfd, &t_set1);
  107.                 printf("--------------->1n");
  108.                 h= select(sockfd +1, &t_set1, NULL, NULL, &tv);
  109.                  
  110.                 printf("--------------->2n");
  111.                
  112.                 if (h == 0) continue;
  113.                 if (h < 0) {
  114.                          close(sockfd);
  115.                        printf("some thing read error!n");
  116.                        return -1;

  117.                  };
  118.     
  119.                 if (h > 0){
  120.                         memset(buf, 0, 8184);
  121.                         i= recv(sockfd, (void *)buf, 8184,0);
  122.                         printf("i = %dn",i);
  123.                           if (i==0){

  124.                                close(sockfd);
  125.                                printf("read message find error,stop!n");
  126.                                return -1;

  127.                        }
  128.                 // fwrite(buf,sizeof(char),strlen(buf),fp);
  129.                   // fflush(fp);
  130.                        parse(buf);
  131.                    printf("%sn", buf);
  132.                   
  133.                    

  134.                  }

  135.          }

  136.          close(sockfd);
  137.          return 0;
  138.     
  139. }
  140. int main(int argc, char *argv[])  
    {

         
         FILE *fp;
         int res;
         char buf[BUFSIZE], *str;  
           
          if(argc != 2)
         {
             fprintf(stderr,"input domain name");
             exit(-1);
         
         }

         frontier=fopen("frontier.txt","a+");
          if(frontier==NULL)
          {
          printf("open error");
          return 1;
          }

        if((fp = fopen("111.txt","a+")) < 0)
            {
                printf("fopen error");
                exit(-1);
        
            }

         if((res = httpget(argv[1])) == 0)
             {
                        printf("httpget success\n");
                        exit(-1);         
             }
             
        return 0;  
    }



阅读(1474) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~