/*
 * Closes `sock` and stores -1 in `*nPSock` so the caller knows it no longer
 * holds a reusable connection.
 */
static void fetchDropConnection(int sock,int *nPSock)
{
    close(sock);
    *nPSock=-1;
}

/*
 * Fetches the page at strUrl with a single HTTP GET request.
 *
 * Parameters:
 *   strUrl      URL of the page to fetch. It is parsed with CUrl::ParseUrlEx();
 *               per the original author's note, a URL without the "http://"
 *               scheme fails to parse.
 *   fileBuf     out: receives the NUL-terminated page body, allocated with
 *               malloc()/realloc() (presumably to be released by the caller
 *               with free() -- confirm against callers). May be NULL, in which
 *               case the body is read and discarded and fileHeadBuf is not
 *               written either.
 *   fileHeadBuf out: receives a heap copy of the raw response header. Only
 *               written when fileBuf is non-NULL. May be NULL.
 *   location    out: on a 301/302 response, receives a strdup()'d copy of the
 *               redirect target. May be NULL.
 *   nPSock      in/out: on entry, -1 asks for a new connection and any other
 *               value is used as an already-connected socket. On success it
 *               receives the socket that was used; whenever this function
 *               closes the socket it is set to -1.
 *
 * Returns:
 *   >= 0  number of body bytes read
 *   -1    any other error (bad URL, I/O failure, non-2xx status, missing
 *         Content-Length, page larger than MAX_PAGE_BUF_SIZE, out of memory)
 *   -2    CreateSocket() returned -2 (host is inside the blocked IP range)
 *   -3    CreateSocket() returned -1 (invalid host)
 *   -4    the Content-Type contains "image"
 *   -300  the server answered 301/302
 */
int CHttp::Fetch(string strUrl,char **fileBuf,char **fileHeadBuf,char **location,int *nPSock)
{
    if(strUrl.empty())// an empty URL can never be fetched
    {
        cout<<"strUrl is NULL in CHttp::Fecth()"<<endl;
        return -1;
    }

    CUrl u;
    if(u.ParseUrlEx(strUrl)==false)
    {
        cout<<"strUrl ParseUrlEx() error in CHttp::Fetch()"<<endl;
        return -1;
    }
    const char *host=u.m_sHost.c_str();
    const char *path=u.m_sPath.c_str();
    int port=80;// default HTTP port
    if(u.m_nPort>0)
        port=u.m_nPort;

    /*
     * Build the request, e.g. for "http://www.baidu.com/ecjtu/nihao.html":
     *   GET /ecjtu/nihao.html HTTP/1.0\r\n
     *   Host: www.baidu.com\r\n
     *   User-Agent: Tse/1.0\r\n
     *   Connection: Keep-Alive\r\n\r\n
     * A string owns the storage, so no early return below has to free it.
     */
    string request("GET ");
    if(strlen(path)<1)
        request+="/";// empty path means the site root
    else
        request+=path;
    request+=" ";
    request+=HTTP_VERSION;
    request+="\r\n";
    request+="Host: ";
    request+=host;
    request+="\r\n";
    if(!hideUserAgent)
    {
        request+="User-Agent: ";
        if(userAgent==NULL)
        {
            // no custom agent configured: send "<DEFAULT_USER_AGENT>/<VERSION>"
            request+=DEFAULT_USER_AGENT;
            request+="/";
            request+=VERSION;
        }
        else
            request+=userAgent;
        request+="\r\n";
    }
    request+="Connection: Keep-Alive\r\n\r\n";

    /* Reuse the caller's connection when one is supplied, otherwise connect. */
    int sock;
    if(*nPSock!=-1)
    {
        sock=*nPSock;
        cout<<"使用当前的socket: "<<*nPSock<<endl;
    }
    else
    {
        sock=CreateSocket(host,port);
        if(sock==-1)// invalid host
        {
            cout<<"1.CreateSocket() error in CHttp::Fetch() "<<endl;
            return -3;
        }
        if(sock==-2)// inside the blocked IP range
        {
            cout<<"2.CreateSocket() error in CHttp::Fetch() "<<endl;
            return -2;
        }
    }

    /* Send the request. */
    int ret=write(sock,request.c_str(),request.size());
    if(ret==0)// nothing was written
    {
        cout<<"没有写任何东西 in CHttp::Fetch()"<<endl;
        fetchDropConnection(sock,nPSock);
        return -1;
    }
    if(ret==-1)
    {
        // The write failed (e.g. a reused connection went stale): reconnect
        // once and retry. Failures on this path are all reported as -1.
        fetchDropConnection(sock,nPSock);
        sock=CreateSocket(host,port);
        if(sock==-1)
        {
            cout<<"3.CreateSocket() error in CHttp::Fetch() "<<endl;
            return -1;
        }
        if(sock==-2)
        {
            cout<<"4.CreateSocket() error in CHttp::Fetch() "<<endl;
            return -1;
        }
        if(write(sock,request.c_str(),request.size())==-1)
        {
            fetchDropConnection(sock,nPSock);
            cout<<"write() error in CHttp::Fetch() "<<endl;
            return -1;
        }
    }

    /* Read and parse the response header. */
    char headerBuf[HEADER_BUF_SIZE];
    memset(headerBuf,0,sizeof(headerBuf));
    ret=read_header(sock,headerBuf);
    if(ret<0)
    {
        fetchDropConnection(sock,nPSock);
        cout<<"1.read_header() error in CHttp::Fetch() "<<endl;
        return -1;
    }
    if(headerBuf[0]=='\0')
    {
        fetchDropConnection(sock,nPSock);
        cout<<"2.read_header() error in CHttp::Fetch() "<<endl;
        return -1;
    }

    CPage iPage;
    iPage.ParseHeaderInfo(headerBuf);
    if(iPage.m_nStatusCode==-1)
    {
        fetchDropConnection(sock,nPSock);
        cout<<"iPage.m_nStatusCode==-1"<<endl;
        return -1;
    }
    if(iPage.m_nStatusCode==301||iPage.m_nStatusCode==302)
    {
        // Redirect: the connection is dropped in every case.
        fetchDropConnection(sock,nPSock);
        if(iPage.m_sLocation.empty()||iPage.m_sLocation.length()>URL_LEN)
        {
            cout<<"重定向了,但是:iPage.m_sLocation.empty()||iPage.m_sLocation.length()>URL_LEN"<<endl;
            return -1;
        }
        if(location!=NULL)
        {
            char *loc=strdup(iPage.m_sLocation.c_str());
            if(loc==NULL)
            {
                // Do not report a redirect without handing back its target.
                cout<<"location strdup() error in CHttp::Fetch()"<<endl;
                return -1;
            }
            *location=loc;
        }
        cout<<"重定向了"<<endl;
        return -300;
    }
    if(iPage.m_nStatusCode<200||iPage.m_nStatusCode>299)
    {
        fetchDropConnection(sock,nPSock);
        cout<<"iPage.m_nStatusCode<200||iPage.m_nStatusCode>299"<<endl;
        return -1;
    }
    if(iPage.m_sContentType.find("image")!=string::npos)
    {
        fetchDropConnection(sock,nPSock);
        cout<<"iPage.m_sContentType 是image/xxx类型,不是我们想要的网页类型"<<endl;
        return -4;
    }
    if(iPage.m_nContentLength==-1)
    {
        fetchDropConnection(sock,nPSock);
        cout<<"iPage.m_nContentLength==-1"<<endl;
        return -1;
    }
    // Tiny or zero lengths are replaced by the default buffer size. From here
    // on m_nContentLength doubles as the per-read chunk size.
    if(iPage.m_nContentLength<20)
        iPage.m_nContentLength=DEFAULT_PAGE_BUF_SIZE;
    if(iPage.m_nContentLength>MAX_PAGE_BUF_SIZE)
    {
        cout<<"这个网页的长度大于5M,我过滤掉它!"<<endl;
        fetchDropConnection(sock,nPSock);
        return -1;
    }

    char *pageBuf=(char *)malloc(iPage.m_nContentLength);
    if(pageBuf==NULL)
    {
        fetchDropConnection(sock,nPSock);
        cout<<"pageBuf malloc erro in CHttp::Fetch"<<endl;
        return -1;
    }

    /*
     * Switch the socket to non-blocking mode for the body read loop.
     * NOTE(review): O_NONBLOCK is never cleared, so a socket handed back
     * through *nPSock stays non-blocking for the next call.
     */
    int flags=fcntl(sock,F_GETFL,0);
    if(flags<0)
    {
        fetchDropConnection(sock,nPSock);
        free(pageBuf);
        cout<<"1.fcntl() error in CHttp::Fetch()"<<endl;
        return -1;
    }
    flags|=O_NONBLOCK;
    if(fcntl(sock,F_SETFL,flags)<0)
    {
        fetchDropConnection(sock,nPSock);
        free(pageBuf);
        cout<<"2.fcntl() error in CHttp::Fetch()"<<endl;
        return -1;
    }

    /*
     * Read the body. Invariant: pageBuf always has m_nContentLength free
     * bytes after the bytesRead bytes already stored.
     * NOTE(review): the total size is not capped here; only the declared
     * Content-Length was checked against MAX_PAGE_BUF_SIZE above.
     */
    int bytesRead=0;
    int selectRet;
    fd_set rfds;
    struct timeval tv;
    while(ret>0)
    {
        FD_ZERO(&rfds);
        FD_SET(sock,&rfds);
        // Once the declared length has arrived, wait only briefly for more.
        if(bytesRead==iPage.m_nContentLength)
            tv.tv_sec=1;
        else
            tv.tv_sec=timeout;
        tv.tv_usec=0;
        // NOTE(review): this tests DEFAULT_TIMEOUT although tv is filled from
        // `timeout`; kept as in the original -- confirm which one is intended.
        if(DEFAULT_TIMEOUT>=0)
            selectRet=select(sock+1,&rfds,NULL,NULL,&tv);
        else
            selectRet=select(sock+1,&rfds,NULL,NULL,NULL);
        if(selectRet==-1)
        {
            fetchDropConnection(sock,nPSock);
            free(pageBuf);
            cout<<"select() 出错 in CHttp::Fetch()"<<endl;
            return -1;
        }
        if(selectRet==0)
        {
            // Timed out. With data already received this marks the end of
            // the body (keep-alive servers do not close); with none it is
            // a failure.
            if(bytesRead>0)
                break;
            fetchDropConnection(sock,nPSock);
            free(pageBuf);
            cout<<"select() over time in CHttp::Fetch()"<<endl;
            return -1;
        }

        ret=read(sock,pageBuf+bytesRead,iPage.m_nContentLength);
        if(ret==0)// peer closed the connection
            break;
        if(ret==-1)
        {
            if(bytesRead>0)// keep what has been received so far
                break;
            fetchDropConnection(sock,nPSock);
            free(pageBuf);
            cout<<"read() error in CHttp::Fetch()"<<endl;
            return -1;
        }
        bytesRead+=ret;

        // Grow through a temporary so the old block can still be freed if
        // realloc() fails.
        char *grown=(char *)realloc(pageBuf,bytesRead+iPage.m_nContentLength);
        if(grown==NULL)
        {
            fetchDropConnection(sock,nPSock);
            free(pageBuf);
            cout<<"2.realloc() error in CHttp::Fetch()"<<endl;
            return -1;
        }
        pageBuf=grown;
    }

    /* Trim the buffer to its final size and NUL-terminate it. */
    char *fitted=(char *)realloc(pageBuf,bytesRead+1);
    if(fitted==NULL)
    {
        fetchDropConnection(sock,nPSock);
        free(pageBuf);
        cout<<"3.realloc() error in CHttp::Fetch()"<<endl;
        return -1;
    }
    pageBuf=fitted;
    pageBuf[bytesRead]='\0';

    /* Hand the results to the caller. */
    if(fileBuf==NULL)
    {
        free(pageBuf);// caller is not interested in the body
    }
    else
    {
        if(fileHeadBuf!=NULL)
        {
            char *headCopy=strdup(headerBuf);
            if(headCopy==NULL)
            {
                fetchDropConnection(sock,nPSock);
                free(pageBuf);
                cout<<"tmp malloc() error in CHttp::Fetch()"<<endl;
                return -1;
            }
            *fileHeadBuf=headCopy;
        }
        *fileBuf=pageBuf;
    }
    *nPSock=sock;// keep the connection available for reuse
    return bytesRead;
}