Chinaunix首页 | 论坛 | 博客
  • 博客访问: 4511082
  • 博文数量: 356
  • 博客积分: 10458
  • 博客等级: 上将
  • 技术积分: 4734
  • 用 户 组: 普通用户
  • 注册时间: 2008-03-24 14:59
文章分类

全部博文(356)

文章存档

2020年(17)

2019年(9)

2018年(26)

2017年(5)

2016年(11)

2015年(20)

2014年(2)

2013年(17)

2012年(15)

2011年(4)

2010年(7)

2009年(14)

2008年(209)

分类: C/C++

2012-07-01 13:40:17

最近用curl模拟一些网站的登录要对html进行解析,而libxml2要求比较严格,很多html页面无法正确解析,为此才接触到一个c库Html Tidy,故引出本文。

此库官方网站: http://tidy.sourceforge.net/

最新代码可通过cvs下载

cvs -d:pserver:anonymous@tidy.cvs.sourceforge.net:/cvsroot/tidy login
cvs -z3 -d:pserver:anonymous@tidy.cvs.sourceforge.net:/cvsroot/tidy co -P tidy

交叉编译

cd tidy
./build/gnuauto/setup.sh

mkdir ../build-tidy
cd ../build-tidy
../tidy/configure --host=arm-linux CC=arm-linux-gcc --prefix=`pwd`/out
make
make install
# out/include only exists once "make install" has run, so the headers are
# moved into a tidy/ subdirectory afterwards; the example program includes
# them as <tidy/...>.
mkdir -p ./out/include/tidy
mv ./out/include/*.h ./out/include/tidy/
cd ../

cvs上下来的源码编译安装后include目录下少了tidy目录,而其示例程序用的是 #include <tidy/tidy.h>,不手动添加就无法编译通过

作者:帅得不敢出门 C++哈哈堂<31843264>

废话不说,上示例:


点击(此处)折叠或打开

  1. #include <stdio.h>
  2. #include <tidy/tidy.h>
  3. #include <tidy/buffio.h>
  4. #include <curl/curl.h>

  5. /* curl write callback, to fill tidy's input buffer... */
  6. uint write_cb(char *in, uint size, uint nmemb, TidyBuffer *out) {
  7.   uint r;
  8.   r = size * nmemb;
  9.   tidyBufAppend( out, in, r );
  10.   return(r);
  11. }

  12. /* Traverse the document tree */
  13. void dumpNode(TidyDoc doc, TidyNode tnod, int indent ) {
  14.   TidyNode child;
  15.   for ( child = tidyGetChild(tnod); child; child = tidyGetNext(child) )
  16.   {
  17.     ctmbstr name = tidyNodeGetName( child );
  18.     if ( name )
  19.     {
  20.       /* if it has a name, then it's an HTML tag ... */
  21.       TidyAttr attr;
  22.       printf( "%*.*s%s ", indent, indent, "<", name);
  23.       /* walk the attribute list */
  24.       for ( attr=tidyAttrFirst(child); attr; attr=tidyAttrNext(attr) ) {
  25.         printf(tidyAttrName(attr));
  26.         tidyAttrValue(attr)?printf("=\"%s\" ", tidyAttrValue(attr)):printf(" ");
  27.       }
  28.       printf( ">\n");
  29.     } else {
  30.       /* if it doesn't have a name, then it's probably text, cdata, etc... */
  31.       TidyBuffer buf;
  32.       tidyBufInit(&buf);
  33.       tidyNodeGetText(doc, child, &buf);
  34.       printf("%*.*s\n", indent, indent, buf.bp?(char *)buf.bp:"");
  35.       tidyBufFree(&buf);
  36.     }
  37.     dumpNode( doc, child, indent + 4 ); /* recursive */
  38.   }
  39. }

  40. int main(int argc, char **argv ) {
  41.   CURL *curl;
  42.   char curl_errbuf[CURL_ERROR_SIZE];
  43.   TidyDoc tdoc;
  44.   TidyBuffer docbuf = {0};
  45.   TidyBuffer tidy_errbuf = {0};
  46.   int err;
  47.   if ( argc == 2) {
  48.     curl = curl_easy_init();
  49.     curl_easy_setopt(curl, CURLOPT_URL, argv[1]);
  50.     curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf);
  51.     curl_easy_setopt(curl, CURLOPT_NOPROGRESS, no);
  52.     curl_easy_setopt(curl, CURLOPT_VERBOSE, yes);
  53.     curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);

  54.     tdoc = tidyCreate();
  55.     tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */
  56.     tidyOptSetInt(tdoc, TidyWrapLen, 4096);
  57.     tidySetErrorBuffer( tdoc, &tidy_errbuf );
  58.     tidyBufInit(&docbuf);

  59.     curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf);
  60.     err=curl_easy_perform(curl);
  61.     if ( !err ) {
  62.       err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */
  63.       if ( err >= 0 ) {
  64.         err = tidyCleanAndRepair(tdoc); /* fix any problems */
  65.         if ( err >= 0 ) {
  66.           err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */
  67.           if ( err >= 0 ) {
  68.             dumpNode( tdoc, tidyGetRoot(tdoc), 0 ); /* walk the tree */
  69.             fprintf(stderr, "%s\n", tidy_errbuf.bp); /* show errors */
  70.          }
  71.         }
  72.       }
  73.     } else fprintf(stderr, "%s\n", curl_errbuf);

  74.     /* clean-up */
  75.     curl_easy_cleanup(curl);
  76.     tidyBufFree(&docbuf);
  77.     tidyBufFree(&tidy_errbuf);
  78.     tidyRelease(tdoc);
  79.     return(err);

  80.   } else printf( "usage: %s \n", argv[0] );
  81.   return(0);
  82. }


阅读(4282) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~