在"c语言写soap"文章中提到用C下载soap,其实也就是http内容。
地址:http://blog.chinaunix.net/u1/54401/showart.php?id=2310264
不过后来发现有个问题:有些 web server(比如 lighttpd)不返回 Content-Length 头,而是使用分块传输编码(Transfer-Encoding: chunked),在 HTTP 头和包体之间直接写出每一块的长度,这样解析起来稍有困难。后来参考 httpfetch 的代码解决了这个问题,做到了与常用浏览器兼容。
下载就采用那个文章上的方案,分析采用httpfetch,很简单,一个函数(其中调用几个工具函数):
/* Value placed in tv_sec for the select() calls that wait for socket data;
 * when negative, select() is called without a timeout and may block
 * indefinitely */
int timeout = 2;
/* Custom User-Agent value; when NULL (and hideUserAgent is 0) http_fetch
 * sends DEFAULT_USER_AGENT "/" HTTP_VERSION instead */
char *userAgent = NULL;
/* Referer value; only sent when non-NULL and hideReferer is 0 */
char *referer = NULL;
/* Nonzero: do not send a User-Agent header at all */
int hideUserAgent = 0;
/* Nonzero: do not send a Referer header (this is the initial setting) */
int hideReferer = 1;
/* A negative value makes http_fetch follow redirects without limit */
static int followRedirects = DEFAULT_REDIRECTS; /* # of redirects to follow */
extern const char *http_errlist[]; /* Array of HTTP Fetcher error messages */
extern char convertedError[128]; /* Buffer used when errors contain %d */
/* Selects which error domain http_strerror() reports: ERRNO, H_ERRNO or
 * FETCHER_ERROR */
static int errorSource = 0;
/* Index into http_errlist, consulted when errorSource is FETCHER_ERROR */
static int http_errno = 0;
static int errorInt = 0; /* When the error message has a %d in it,
* this variable is inserted */
/*
 * Append the string s to the heap-allocated request buffer *buf, growing the
 * buffer through _checkBufSize() when there is not enough room.
 * Returns 0 on success and nonzero when the buffer could not be grown; in
 * that case *buf is unchanged and still owned by the caller.
 */
static int appendToRequest(char **buf, int *bufsize, const char *s)
{
    if(_checkBufSize(buf, bufsize, (int)strlen(s) + 1))
        return -1;
    strcat(*buf, s);
    return 0;
}

/*
 * Download the resource named by url_tmp with an HTTP GET request,
 * following redirects (status 300-307 carrying a "Location:" header) up to
 * followRedirects times, or without limit when followRedirects is negative.
 *
 * url_tmp  URL to fetch; the "scheme://" prefix is optional and a URL
 *          without a path is requested as "/".
 * fileBuf  If non-NULL, *fileBuf receives a malloc()ed, NUL-terminated
 *          buffer holding the response body, which the caller must free().
 *          If NULL, the body is downloaded and discarded.
 *
 * The body is stored exactly as it is read from the socket until the peer
 * closes the connection; Content-Length is only used as the allocation and
 * read chunk size, and no transfer decoding is applied.
 *
 * Returns the number of body bytes read, or -1 on error. On error the
 * file-level error state (errorSource / http_errno / errorInt) is set so
 * that http_strerror() can describe the failure.
 */
int http_fetch(const char *url_tmp, char **fileBuf)
{
    fd_set rfds;
    struct timeval tv;
    char headerBuf[HEADER_BUF_SIZE];
    char *tmp, *url = NULL, *pageBuf = NULL, *requestBuf = NULL;
    char *host, *charIndex;
    const char *path;
    size_t requestLen, sent, locationLen;
    ssize_t written;
    int sock = -1, bytesRead = 0, contentLength = -1, bufsize;
    int i, ret, selectRet,
        found = 0,              /* For redirects */
        redirectsFollowed = 0;

    if(url_tmp == NULL)
    {
        errorSource = FETCHER_ERROR;
        http_errno = HF_NULLURL;
        return -1;
    }

    /* Copy the url passed in into a buffer we can work with, change, etc. */
    url = (char *)malloc(strlen(url_tmp) + 1);
    if(url == NULL)
        goto fail_errno;
    strcpy(url, url_tmp);

    /* Each iteration performs one request; a redirect response replaces
     * url and loops again. */
    do
    {
        /* Split the url into host and path, skipping any "scheme://" */
        charIndex = strstr(url, "://");
        host = (charIndex != NULL) ? charIndex + strlen("://") : url;
        charIndex = strchr(host, '/');
        /* No '/' after the host: assume a root-level request */
        path = (charIndex != NULL) ? charIndex : "/";

        /* Compose the request line */
        bufsize = REQUEST_BUF_SIZE;
        requestBuf = (char *)malloc(bufsize);
        if(requestBuf == NULL)
            goto fail_errno;
        requestBuf[0] = 0;
        if(appendToRequest(&requestBuf, &bufsize, "GET ") ||
            appendToRequest(&requestBuf, &bufsize, path) ||
            appendToRequest(&requestBuf, &bufsize, " ") ||
            appendToRequest(&requestBuf, &bufsize, HTTP_VERSION) ||
            appendToRequest(&requestBuf, &bufsize, "\r\n"))
            goto fail_errno;

        /* The path has been copied, so the host can now be terminated
         * in place */
        if(charIndex != NULL)
            *charIndex = 0;

        /* Use Host: even though 1.0 doesn't specify it. Some servers
         * won't play nice if we don't send Host, and it shouldn't
         * hurt anything */
        if(appendToRequest(&requestBuf, &bufsize, "Host: ") ||
            appendToRequest(&requestBuf, &bufsize, host) ||
            appendToRequest(&requestBuf, &bufsize, "\r\n"))
            goto fail_errno;

        if(!hideReferer && referer != NULL) /* NO default referer */
        {
            if(appendToRequest(&requestBuf, &bufsize, "Referer: ") ||
                appendToRequest(&requestBuf, &bufsize, referer) ||
                appendToRequest(&requestBuf, &bufsize, "\r\n"))
                goto fail_errno;
        }

        if(!hideUserAgent)
        {
            if(appendToRequest(&requestBuf, &bufsize, "User-Agent: "))
                goto fail_errno;
            if(userAgent == NULL)
            {
                if(appendToRequest(&requestBuf, &bufsize,
                        DEFAULT_USER_AGENT) ||
                    appendToRequest(&requestBuf, &bufsize, "/") ||
                    appendToRequest(&requestBuf, &bufsize, HTTP_VERSION))
                    goto fail_errno;
            }
            else if(appendToRequest(&requestBuf, &bufsize, userAgent))
                goto fail_errno;
            if(appendToRequest(&requestBuf, &bufsize, "\r\n"))
                goto fail_errno;
        }

        if(appendToRequest(&requestBuf, &bufsize,
                "Connection: Close\r\n\r\n"))
            goto fail_errno;

        sock = makeSocket(host); /* errorSource set within makeSocket */
        if(sock == -1)
            goto fail;
        free(url); /* host pointed into url and is not used past here */
        url = NULL;

        /* Send the whole request; write() may accept only part of it */
        requestLen = strlen(requestBuf);
        sent = 0;
        while(sent < requestLen)
        {
            written = write(sock, requestBuf + sent, requestLen - sent);
            if(written <= 0)
                goto fail_errno;
            sent += (size_t)written;
        }
        free(requestBuf);
        requestBuf = NULL;

        /* Grab enough of the response to get the metadata */
        ret = _http_read_header(sock, headerBuf); /* errorSource set within */
        if(ret < 0)
            goto fail;

        /* Get the return code: the number after the first space that
         * follows "HTTP/" */
        charIndex = strstr(headerBuf, "HTTP/");
        if(charIndex == NULL)
        {
            errorSource = FETCHER_ERROR;
            http_errno = HF_FRETURNCODE;
            goto fail;
        }
        charIndex = strchr(charIndex, ' ');
        if(charIndex == NULL || sscanf(charIndex + 1, "%d", &i) != 1)
        {
            errorSource = FETCHER_ERROR;
            http_errno = HF_CRETURNCODE;
            goto fail;
        }
        if(i < 200 || i > 307)
        {
            errorInt = i; /* Status code, to be inserted in error string */
            errorSource = FETCHER_ERROR;
            http_errno = HF_STATUSCODE;
            goto fail;
        }

        if(i < 300)
            found = 1;
        else
        {
            /* Redirect: pick up the new URL and repeat the process.
             * Note the match on "Location:" is case sensitive. */
            redirectsFollowed++;
            charIndex = strstr(headerBuf, "Location:");
            if(charIndex != NULL)
            {
                charIndex += strlen("Location:");
                /* Skip any whitespace... */
                while(*charIndex != '\0' &&
                    isspace((unsigned char)*charIndex))
                    charIndex++;
            }
            if(charIndex == NULL || *charIndex == '\0')
            {
                errorInt = i; /* Status code, for the error string */
                errorSource = FETCHER_ERROR;
                http_errno = HF_CANTREDIRECT;
                goto fail;
            }
            locationLen = strcspn(charIndex, " \r\n");
            if(locationLen > 0)
            {
                url = (char *)malloc(locationLen + 1);
                if(url == NULL)
                    goto fail_errno;
                memcpy(url, charIndex, locationLen);
                url[locationLen] = '\0';
                /* This connection is finished; the next request (if any)
                 * opens a new one */
                close(sock);
                sock = -1;
            }
            else
                /* Found 'Location:' but contains no URL! We'll handle it as
                 * 'found', hopefully the resulting document will give the
                 * user a hint as to what happened. */
                found = 1;
        }
    }
    while(!found &&
        (followRedirects < 0 || redirectsFollowed <= followRedirects));

    if(!found)
    {
        /* Ran out of permitted redirects; url holds the unused target and
         * is released by the cleanup code */
        errorInt = followRedirects; /* To be inserted in error string */
        errorSource = FETCHER_ERROR;
        http_errno = HF_MAXREDIRECTS;
        goto fail;
    }

    /*
     * Parse out about how big the data segment is.
     * Content-Length is not guaranteed to be accurate or even present, so
     * it is only used as a ballpark for memory allocation.
     *
     * Note that some servers use different capitalization
     */
    charIndex = strstr(headerBuf, "Content-Length:");
    if(charIndex == NULL)
        charIndex = strstr(headerBuf, "Content-length:");
    if(charIndex != NULL)
    {
        /* Skip only the name and colon; %d skips optional whitespace */
        ret = sscanf(charIndex + strlen("Content-Length:"), "%d",
            &contentLength);
        if(ret < 1)
        {
            errorSource = FETCHER_ERROR;
            http_errno = HF_CONTENTLEN;
            goto fail;
        }
    }

    /* A missing, zero or negative length cannot serve as a chunk size */
    if(contentLength <= 0)
        contentLength = DEFAULT_PAGE_BUF_SIZE;
    pageBuf = (char *)malloc(contentLength);
    if(pageBuf == NULL)
        goto fail_errno;

    /* Read the body until the peer closes the connection */
    do
    {
        FD_ZERO(&rfds);
        FD_SET(sock, &rfds);
        tv.tv_sec = timeout;
        tv.tv_usec = 0;

        if(timeout >= 0)
            selectRet = select(sock + 1, &rfds, NULL, NULL, &tv);
        else /* No timeout, can block indefinitely */
            selectRet = select(sock + 1, &rfds, NULL, NULL, NULL);

        if(selectRet == 0)
        {
            errorSource = FETCHER_ERROR;
            http_errno = HF_DATATIMEOUT;
            errorInt = timeout;
            goto fail;
        }
        if(selectRet == -1)
            goto fail_errno;

        ret = read(sock, pageBuf + bytesRead, contentLength);
        if(ret == -1)
            goto fail_errno;
        bytesRead += ret;

        if(ret > 0)
        {
            /* To be tolerant of inaccurate Content-Length fields, keep a
             * full read-sized chunk of free space after the data. */
            tmp = (char *)realloc(pageBuf, bytesRead + contentLength);
            if(tmp == NULL)
                goto fail_errno;
            pageBuf = tmp;
        }
    }
    while(ret > 0);

    /*
     * The download buffer is too large. Trim off the safety padding.
     * One NUL byte is added after the data, as it may not already be
     * terminated and we can't be sure what the caller will do with it.
     */
    tmp = (char *)realloc(pageBuf, bytesRead + 1);
    if(tmp == NULL)
        goto fail_errno;
    pageBuf = tmp;
    pageBuf[bytesRead] = '\0';

    close(sock);
    if(fileBuf == NULL) /* They just wanted us to "hit" the url */
        free(pageBuf);
    else
        *fileBuf = pageBuf;
    return bytesRead;

fail_errno:
    errorSource = ERRNO;
fail:
    /* Single cleanup path: every pointer is either NULL or still owned */
    if(sock != -1)
        close(sock);
    free(pageBuf);
    free(requestBuf);
    free(url);
    return -1;
}
/*
 * Make sure the string buffer *buf (capacity *bufsize bytes) has strictly
 * more than `more` spare bytes beyond its current contents and terminator.
 * When it does not, the buffer is reallocated `more + 1` bytes larger and
 * both *buf and *bufsize are updated.
 * Returns 0 on success, -1 if realloc() fails (*buf and *bufsize are then
 * left unchanged).
 */
int _checkBufSize(char **buf, int *bufsize, int more)
{
    int spare = *bufsize - (strlen(*buf) + 1);
    char *grown;

    if(spare <= more)
    {
        grown = (char *)realloc(*buf, *bufsize + more + 1);
        if(grown == NULL)
            return -1;
        *bufsize += more + 1;
        *buf = grown;
    }
    return 0;
}
/*
 * Resolve host and open a TCP connection to it.
 *
 * host  Host name, optionally followed by ":port". Without a port,
 *       PORT_NUMBER is used. The string is not modified.
 *
 * Returns the connected socket descriptor, or -1 on failure with
 * errorSource set to H_ERRNO (name lookup failed) or ERRNO (allocation,
 * socket() or connect() failed).
 */
int makeSocket(const char *host)
{
    int sock;               /* Socket descriptor */
    struct sockaddr_in sa;  /* Socket address */
    struct hostent *hp;     /* Host entity */
    int port = PORT_NUMBER;
    const char *colon;
    char *hostname;
    size_t hostLen;

    /* Check for port number specified in URL */
    colon = strchr(host, ':');
    if(colon != NULL)
    {
        port = atoi(colon + 1);
        hostLen = (size_t)(colon - host);
    }
    else
        hostLen = strlen(host);

    /* Look up a private copy of the bare host name rather than writing a
     * terminator through the caller's const pointer */
    hostname = (char *)malloc(hostLen + 1);
    if(hostname == NULL) { errorSource = ERRNO; return -1; }
    memcpy(hostname, host, hostLen);
    hostname[hostLen] = '\0';

    hp = gethostbyname(hostname);
    free(hostname);
    if(hp == NULL) { errorSource = H_ERRNO; return -1; }

    /* Copy host address from hostent to (server) socket address */
    memset(&sa, 0, sizeof(sa));
    memcpy((char *)&sa.sin_addr, (char *)hp->h_addr_list[0], hp->h_length);
    sa.sin_family = hp->h_addrtype;
    sa.sin_port = htons(port);

    sock = socket(hp->h_addrtype, SOCK_STREAM, 0);
    if(sock == -1) { errorSource = ERRNO; return -1; }

    if(connect(sock, (struct sockaddr *)&sa, sizeof(sa)) == -1)
    {
        close(sock); /* Don't leak the descriptor on a failed connect */
        errorSource = ERRNO;
        return -1;
    }
    return sock;
}
/*
 * Read the HTTP response header from sock one byte at a time, stopping
 * after the blank line that ends the header (two consecutive LFs, with CRs
 * ignored), when HEADER_BUF_SIZE bytes have been read, or when the peer
 * closes the connection.
 *
 * sock       Connected socket to read from.
 * headerPtr  Destination buffer of at least HEADER_BUF_SIZE bytes. On
 *            success it holds the header as a NUL-terminated string with
 *            the trailing CR/LF characters removed; if the buffer filled
 *            up, the last byte is given up for the terminator.
 *
 * Returns the number of bytes consumed from the socket, or -1 on error
 * with errorSource (and, for a timeout, http_errno/errorInt) set.
 */
int _http_read_header(int sock, char *headerPtr)
{
    fd_set rfds;
    struct timeval tv;
    char *cur = headerPtr;
    int bytesRead = 0, newlines = 0, ret, selectRet;

    while(newlines != 2 && bytesRead != HEADER_BUF_SIZE)
    {
        FD_ZERO(&rfds);
        FD_SET(sock, &rfds);
        tv.tv_sec = timeout;
        tv.tv_usec = 0;

        if(timeout >= 0)
            selectRet = select(sock + 1, &rfds, NULL, NULL, &tv);
        else /* No timeout, can block indefinitely */
            selectRet = select(sock + 1, &rfds, NULL, NULL, NULL);

        if(selectRet == 0)
        {
            errorSource = FETCHER_ERROR;
            http_errno = HF_HEADTIMEOUT;
            errorInt = timeout;
            return -1;
        }
        else if(selectRet == -1) { errorSource = ERRNO; return -1; }

        ret = read(sock, cur, 1);
        if(ret == -1) { errorSource = ERRNO; return -1; }
        if(ret == 0) /* Peer closed the connection: nothing was stored */
            break;
        bytesRead++;

        if(*cur == '\n') /* LF is the separator */
            newlines++;
        else if(*cur != '\r') /* CR neither counts nor resets the count */
            newlines = 0;
        cur++;
    }

    /* Buffer completely full: step back so the terminator fits */
    if(bytesRead == HEADER_BUF_SIZE && cur > headerPtr)
        cur--;
    /* Snip the trailing line terminators without ever stepping in front
     * of the buffer */
    while(cur > headerPtr && (cur[-1] == '\r' || cur[-1] == '\n'))
        cur--;
    *cur = '\0';

    return bytesRead;
}
/*
 * Return a human-readable message for the most recent failure recorded in
 * the file-level error state.
 *
 * Depending on errorSource the message comes from strerror(errno), from
 * hstrerror(h_errno) (or a fixed http_errlist entry when HAVE_HSTRERROR is
 * not defined), or from http_errlist[http_errno]. When the latter contains
 * a "%d", errorInt is substituted for it and the result is built in the
 * shared convertedError buffer, which the next such call overwrites.
 */
const char *http_strerror()
{
    /* NOTE(review): on platforms where errno is a macro this declaration
     * is only appropriate alongside <errno.h> - confirm against the
     * file's includes */
    extern int errno;
    const char *message, *placeholder;

    if(errorSource == ERRNO)
        return strerror(errno);

    if(errorSource == H_ERRNO)
    {
#ifdef HAVE_HSTRERROR
        return hstrerror(h_errno);
#else
        return http_errlist[HF_HERROR];
#endif
    }

    if(errorSource == FETCHER_ERROR)
    {
        message = http_errlist[http_errno];
        placeholder = strstr(message, "%d");
        if(placeholder == NULL)
            return message;

        /* Text before the %d, then errorInt, then the text after it.
         * snprintf keeps the result inside convertedError even if a
         * message is longer than expected (it is truncated instead). */
        snprintf(convertedError, sizeof(convertedError), "%.*s%d%s",
            (int)(placeholder - message), message, errorInt,
            placeholder + 2);
        return convertedError;
    }

    return http_errlist[HF_METAERROR]; /* Should NEVER happen */
}
阅读(4944) | 评论(0) | 转发(0) |