web信息的搜集---Page.h+Page.cpp<1>

/*
 * Page.h
 *
 * Created on: 2011-10-12
 *
 * Author: qiuxiong
 *
 * 功能:解析网页头信息，从网页中提取链接信息
 */

#ifndef PAGE_H_

#define PAGE_H_

#include<string>

#include<map>

#include<vector>

#include<list>

#include"Url.h"

using namespace std;

const int ANCHOR_TEXT_LEN=256;//maximum length of the descriptive (anchor) text kept for a URL

const int MAX_URL_REFERENCES=1000;//maximum number of URLs extracted from a single page

const int URL_REFERENCE_LEN=(URL_LEN+ANCHOR_TEXT_LEN)*MAX_URL_REFERENCES*1/2;//maximum length of the link markup extracted from a single page (URL_LEN comes from Url.h)

enum page_type{PLAIN_TEXT,OTHER};//page type classification used by CPage::m_eType

//A URL paired with its descriptive text; these are the links collected for the search engine.
//NOTE(review): both members are raw char*; who allocates and frees them is not visible in this file section.

struct RefLink4SE

{

char *link;//url

char *anchor_text;//descriptive (anchor) text of the url

};

//A URL collected for historical page archiving (no anchor text is kept for these).
//NOTE(review): link is a raw char*; its ownership is not visible in this file section.

struct RefLink4History

{

char *link;//url

};

//CPage holds one fetched web page: its URL, the raw header and body, the fields
//parsed out of the header, and the hyperlinks extracted from the body.
//Typical use (as far as this file shows): construct with header/body, call
//ParseHeaderInfo() to fill the header fields, then ParseHyperLinks() for the links.
class CPage

{

public:

string m_sUrl;//URL string of this page

string m_sHeader;//raw header of the page

int m_nLenHeader;//length of the header

int m_nStatusCode;//status code (-1 when it cannot be parsed)

int m_nContentLength;//body length as announced by the header; usually not very accurate

string m_sLocation;//redirect target; can be used to tell whether the page was redirected

bool m_bConnectionState;//true when the connection is persistent (Keep-Alive), false otherwise

string m_sContentEncoding;//encoding of the body

string m_sContentType;//type of the body

string m_sCharset;//character set of the body

string m_sTransferEncoding;//transfer encoding of the body

string m_sContent;//page body

int m_nLenContent;//length of the page body

string m_sContentLinkInfo;//markup extracted from the body that carries hyperlinks, e.g. <img src="www.baidu.com"/>, <a href="www.baidu.com">baidu</a>, <area href="www.baidu.com">baidu</area>

string m_sLinkInfo4SE;//the <a href=...> and <area href=...> markup taken from m_sContentLinkInfo

int m_nLenLinkInfo4SE;//length of m_sLinkInfo4SE

string m_sLinkInfo4History;//the <img src=...> markup taken from m_sContentLinkInfo

int m_nLenLinkInfo4History;//length of m_sLinkInfo4History

RefLink4SE m_RefLink4SE[MAX_URL_REFERENCES];//URL <--> description pairs for the search engine

int m_nRefLink4SENum;//number of entries used in the array above

RefLink4History m_RefLink4History[MAX_URL_REFERENCES/2];//URLs for historical page archiving

int m_nRefLink4HistoryNum;//number of entries used in the array above

map<string,string>m_mapLink4SE;//URL <--> description pairs for the search engine; the map is mainly used to drop duplicate URLs within one page

vector<string>m_vecLink4History;//URLs for archiving; per the original note this container is mainly used to drop duplicate URLs within one page

enum page_type m_eType;//type of the page

public:

CPage();

CPage(string strUrl,string strLocation,char *header,char *body,int nLenBody);

~CPage();

void ParseHeaderInfo(string header);//parse the page header into the m_* header fields

bool ParseHyperLinks();//extract link information from the page

bool NormalizeUrl(string &strUrl);//check whether strUrl is a well-formed url (may modify it: taken by reference)

bool IsFilterLink(string plink);//return true when the link plink should be filtered out

private:

//header parsing helpers

void GetStatusCode(string header);//get the status code

void GetContentLength(string header);//get the body length announced by the header; usually not very accurate

void GetConnectionState(string header);//get the connection state

void GetLocation(string header);//get the redirect target

void GetCharset(string header);//get the character set

void GetContentEncoding(string header);//get the body encoding

void GetContentType(string header);//get the body type

void GetTransferEncoding(string header);//get the transfer encoding of the body

//link extraction helpers

bool GetContentLinkInfo();//extract from the body the markup that carries hyperlinks, e.g. <img src="www.baidu.com"/>, <a href="www.baidu.com">baidu</a>, <area href="www.baidu.com">baidu</area>

bool GetLinkInfo4SE();//take the <a href=...> and <area href=...> markup out of m_sContentLinkInfo

bool GetLinkInfo4History();//take the <img src=...> markup out of m_sContentLinkInfo

bool FindRefLink4SE();//produce the final hyperlinks for the search engine

bool FindRefLink4History();//produce the final hyperlinks for historical page archiving

};

#endif /* PAGE_H_ */

/*
 * Page.cpp
 *
 * Created on: 2011-10-12
 *
 * Author: qiuxiong
 *
 * 功能:解析网页头信息，从网页中提取链接信息
 */

#include<iostream>

#include<cstdio>

#include<cstring>

#include<string>

#include<map>

#include<vector>

#include<iterator>

#include"Url.h"

#include"Page.h"

#include"StrFun.h"

//Default constructor: puts every scalar member and both link tables into a
//known empty state. The string and container members default-construct empty,
//so they need no explicit assignment.
CPage::CPage()
{
    //header-derived fields
    m_nLenHeader=0;
    m_nStatusCode=0;
    m_nContentLength=0;
    m_bConnectionState=false;

    //body and link bookkeeping; the length members were previously left
    //uninitialized, so reading them before parsing was undefined
    m_nLenContent=0;
    m_nLenLinkInfo4SE=0;
    m_nLenLinkInfo4History=0;
    m_nRefLink4SENum=0;
    m_nRefLink4HistoryNum=0;
    m_eType=PLAIN_TEXT;

    //no links extracted yet: every table slot starts as NULL
    for(int i=0;i<MAX_URL_REFERENCES;i++)
    {
        m_RefLink4SE[i].link=NULL;
        m_RefLink4SE[i].anchor_text=NULL;
    }
    for(int i=0;i<MAX_URL_REFERENCES/2;i++)
        m_RefLink4History[i].link=NULL;
}

//Constructs a page from a fetched response.
//  strUrl      - URL the page was fetched from
//  strLocation - redirect URL, or empty when the page was not redirected
//  header      - NUL-terminated raw header; NULL is treated as an empty header
//  body        - page body bytes (may contain NULs); NULL is treated as empty
//  nLenBody    - number of bytes to copy from body; values <= 0 give an empty body
CPage::CPage(string strUrl,string strLocation,char *header,char *body,int nLenBody)
{
    //scalar members start from a known state; the string and container
    //members default-construct empty
    m_nStatusCode=0;
    m_nContentLength=0;
    m_bConnectionState=false;
    m_nLenLinkInfo4SE=0;
    m_nLenLinkInfo4History=0;
    m_nRefLink4SENum=0;
    m_nRefLink4HistoryNum=0;
    m_eType=PLAIN_TEXT;

    //no links extracted yet: every table slot starts as NULL
    for(int i=0;i<MAX_URL_REFERENCES;i++)
    {
        m_RefLink4SE[i].link=NULL;
        m_RefLink4SE[i].anchor_text=NULL;
    }
    for(int i=0;i<MAX_URL_REFERENCES/2;i++)
        m_RefLink4History[i].link=NULL;

    //copy the caller-supplied data
    m_sUrl=strUrl;
    m_sLocation=strLocation;

    //building a std::string from NULL (or calling strlen on it) is undefined
    //behaviour, so a missing header simply stays empty
    if(header!=NULL)
        m_sHeader=header;
    m_nLenHeader=static_cast<int>(m_sHeader.length());

    //assign(ptr,len) copies exactly nLenBody bytes, so embedded NULs survive;
    //a NULL pointer or non-positive length must not reach it
    if(body!=NULL&&nLenBody>0)
        m_sContent.assign(body,nLenBody);
    m_nLenContent=static_cast<int>(m_sContent.length());
}

//Destructor: deliberately empty. The string, map and vector members release
//their storage through their own destructors.
//NOTE(review): the link tables hold raw char* and nothing is freed here;
//whether those pointers own memory is not visible in this file section.
CPage::~CPage() {}

//Parses the page header by running the eight private field extractors over
//it, in a fixed order. Each extractor receives its own copy of headerBuf.
void CPage::ParseHeaderInfo(string headerBuf)
{
    typedef void (CPage::*FieldParser)(string);

    //same call order as always: status, length, connection, location,
    //charset, content encoding, content type, transfer encoding
    const FieldParser fieldParsers[]=
    {
        &CPage::GetStatusCode,
        &CPage::GetContentLength,
        &CPage::GetConnectionState,
        &CPage::GetLocation,
        &CPage::GetCharset,
        &CPage::GetContentEncoding,
        &CPage::GetContentType,
        &CPage::GetTransferEncoding
    };
    const int parserCount=sizeof(fieldParsers)/sizeof(fieldParsers[0]);

    for(int n=0;n<parserCount;++n)
        (this->*fieldParsers[n])(headerBuf);
}

//Extracts the status code from the status line into m_nStatusCode.
//Example: in "HTTP/1.0 200 OK" the status code is 200.
//m_nStatusCode is set to -1 when no "http/" marker is present, when no space
//follows it, or when the token after the space is not a number.
void CPage::GetStatusCode(string headerBuf)
{
    CStrFun::Str2Lower(headerBuf,headerBuf.length());

    m_nStatusCode=-1;

    const string::size_type markerPos=headerBuf.find("http/");
    if(markerPos==string::npos)
        return;

    //bounded search for the space separating the version from the code;
    //a raw pointer scan would run off the end of the buffer when the
    //status line contains no space at all
    const string::size_type spacePos=headerBuf.find(' ',markerPos);
    if(spacePos==string::npos)
        return;

    //%d rather than %i: status codes are decimal, and %i would read a
    //leading 0 as octal
    int code=0;
    if(sscanf(headerBuf.c_str()+spacePos,"%d",&code)==1)
        m_nStatusCode=code;
}

//Extracts the body length announced by the header into m_nContentLength.
//Example: for "content-length: 21237" the result is 21237. The value comes
//from the server and is not guaranteed to be correct.
//When the header is absent m_nContentLength is left untouched; when it is
//present but malformed (no colon on its line, non-numeric or negative value)
//m_nContentLength is set to -1.
void CPage::GetContentLength(string headerBuf)
{
    static const char kName[]="content-length";

    CStrFun::Str2Lower(headerBuf,headerBuf.length());

    const string::size_type namePos=headerBuf.find(kName);
    if(namePos==string::npos)
        return;

    //confine all parsing to the header's own line so that a missing space
    //after the colon can never make us read a value from a later header
    const string::size_type afterName=namePos+sizeof(kName)-1;
    string::size_type lineEnd=headerBuf.find_first_of("\r\n",afterName);
    if(lineEnd==string::npos)
        lineEnd=headerBuf.length();

    int length=-1;
    const string::size_type colonPos=headerBuf.find(':',afterName);
    if(colonPos!=string::npos&&colonPos<lineEnd)
    {
        const string value=headerBuf.substr(colonPos+1,lineEnd-colonPos-1);
        //%d skips leading blanks itself; %i would read "0123" as octal
        if(sscanf(value.c_str(),"%d",&length)!=1||length<0)
            length=-1;
    }
    m_nContentLength=length;
}

//Extracts the redirect target from the header into m_sLocation.
//Example: for "location: http://www.baidu.com/" the result is
//"http://www.baidu.com/". The value keeps its original letter case; only the
//header-name lookup is case-insensitive.
//m_sLocation is left untouched when no "location:" header is present.
void CPage::GetLocation(string headerBuf)
{
    static const char kName[]="location:";

    //keep an unmodified copy: the search runs on the lowercased text, but the
    //URL must be cut from the original (assumes Str2Lower keeps the length,
    //as the index reuse here has always required)
    const string originalCase=headerBuf;
    CStrFun::Str2Lower(headerBuf,headerBuf.length());

    const string::size_type namePos=headerBuf.find(kName);
    if(namePos==string::npos)
        return;

    //the value runs to the end of the line, or to the end of the header when
    //the last line has no CR/LF
    const string::size_type afterName=namePos+sizeof(kName)-1;
    string::size_type lineEnd=headerBuf.find_first_of("\r\n",afterName);
    if(lineEnd==string::npos)
        lineEnd=headerBuf.length();
    string value=originalCase.substr(afterName,lineEnd-afterName);

    //whitespace after the colon is optional and of any length, so trim it
    //rather than assuming exactly one space
    const string::size_type first=value.find_first_not_of(" \t");
    if(first==string::npos)
        value.clear();
    else
        value=value.substr(first,value.find_last_not_of(" \t")-first+1);

    m_sLocation=value;
}

//得到网页字符集

void CPage::GetCharset(string headerBuf)

{

//例如:

//charset=gb2312; gb2312就是这个网页的字符集

string::size_type pre_idx,idx;

const string delims(" \",;>");

CStrFun::Str2Lower(headerBuf,headerBuf.length());

idx=headerBuf.find("charset=");

if(idx!=string::npos)

{

m_sCharset=headerBuf.substr(idx+sizeof("charset=")-1);

}

headerBuf=m_sContent;

headerBuf=headerBuf.substr(0,2024);

CStrFun::Str2Lower(headerBuf,headerBuf.length());

idx=headerBuf.find("charset=");

if(idx!=string::npos)

{

pre_idx=idx+sizeof("charset=")-1;

idx=headerBuf.find_first_of(delims,pre_idx);

m_sCharset=headerBuf.substr(pre_idx,idx-pre_idx);

}

//Extracts the body encoding from the header into m_sContentEncoding.
//Example: for "content-encoding: x-compress" the result is "x-compress".
//The stored value is lowercased. m_sContentEncoding is left untouched when no
//"content-encoding:" header is present.
void CPage::GetContentEncoding(string headerBuf)
{
    static const char kName[]="content-encoding:";

    CStrFun::Str2Lower(headerBuf,headerBuf.length());

    const string::size_type namePos=headerBuf.find(kName);
    if(namePos==string::npos)
        return;

    //the value runs to the end of the line, or to the end of the header when
    //the last line has no CR/LF
    const string::size_type afterName=namePos+sizeof(kName)-1;
    string::size_type valueEnd=headerBuf.find_first_of("\r\n",afterName);
    if(valueEnd==string::npos)
        valueEnd=headerBuf.length();
    string value=headerBuf.substr(afterName,valueEnd-afterName);

    //whitespace after the colon is optional and of any length, so trim it
    //rather than assuming exactly one space
    const string::size_type first=value.find_first_not_of(" \t");
    if(first==string::npos)
        value.clear();
    else
        value=value.substr(first,value.find_last_not_of(" \t")-first+1);

    m_sContentEncoding=value;
}

//Reads the Connection header and records whether the connection is
//persistent. Example: "Connection: Keep-Alive" gives true, "Connection: Close"
//gives false. The comparison is case-insensitive.
//m_bConnectionState is left untouched when no "connection:" header is present.
void CPage::GetConnectionState(string headerBuf)
{
    static const char kName[]="connection:";

    CStrFun::Str2Lower(headerBuf,headerBuf.length());

    const string::size_type namePos=headerBuf.find(kName);
    if(namePos==string::npos)
        return;

    //the value ends at ';' or at the end of the line; a final line without
    //CR/LF ends at the end of the header
    const string::size_type afterName=namePos+sizeof(kName)-1;
    string::size_type valueEnd=headerBuf.find_first_of(";\r\n",afterName);
    if(valueEnd==string::npos)
        valueEnd=headerBuf.length();
    string value=headerBuf.substr(afterName,valueEnd-afterName);

    //trim both sides: the exact comparison below must not fail because of
    //missing, doubled or trailing blanks around the token
    const string::size_type first=value.find_first_not_of(" \t");
    if(first==string::npos)
        value.clear();
    else
        value=value.substr(first,value.find_last_not_of(" \t")-first+1);

    m_bConnectionState=(value=="keep-alive");
}

//Extracts the body type from the header into m_sContentType.
//Example: for "content-type: image/gif" the result is "image/gif"; parameters
//after ';' (such as a charset) are not part of the stored value.
//The stored value is lowercased. m_sContentType is left untouched when no
//"content-type:" header is present.
void CPage::GetContentType(string headerBuf)
{
    static const char kName[]="content-type:";

    CStrFun::Str2Lower(headerBuf,headerBuf.length());

    const string::size_type namePos=headerBuf.find(kName);
    if(namePos==string::npos)
        return;

    //the value ends at ';' or at the end of the line; a final line without
    //CR/LF ends at the end of the header
    const string::size_type afterName=namePos+sizeof(kName)-1;
    string::size_type valueEnd=headerBuf.find_first_of(";\r\n",afterName);
    if(valueEnd==string::npos)
        valueEnd=headerBuf.length();
    string value=headerBuf.substr(afterName,valueEnd-afterName);

    //whitespace after the colon is optional and of any length, so trim it
    //rather than assuming exactly one space
    const string::size_type first=value.find_first_not_of(" \t");
    if(first==string::npos)
        value.clear();
    else
        value=value.substr(first,value.find_last_not_of(" \t")-first+1);

    m_sContentType=value;
}

//Extracts the transfer encoding of the body into m_sTransferEncoding.
//Example: for "transfer-encoding: gzip" the result is "gzip".
//The stored value is lowercased. m_sTransferEncoding is left untouched when
//no "transfer-encoding:" header is present.
void CPage::GetTransferEncoding(string headerBuf)
{
    static const char kName[]="transfer-encoding:";

    CStrFun::Str2Lower(headerBuf,headerBuf.length());

    const string::size_type namePos=headerBuf.find(kName);
    if(namePos==string::npos)
        return;

    //the value ends at ';' or at the end of the line; a final line without
    //CR/LF ends at the end of the header
    const string::size_type afterName=namePos+sizeof(kName)-1;
    string::size_type valueEnd=headerBuf.find_first_of(";\r\n",afterName);
    if(valueEnd==string::npos)
        valueEnd=headerBuf.length();
    string value=headerBuf.substr(afterName,valueEnd-afterName);

    //whitespace after the colon is optional and of any length, so trim it
    //rather than assuming exactly one space
    const string::size_type first=value.find_first_not_of(" \t");
    if(first==string::npos)
        value.clear();
    else
        value=value.substr(first,value.find_last_not_of(" \t")-first+1);

    m_sTransferEncoding=value;
}

//判断一个URL是不是应该过滤，要过滤返回true否则返回false

bool CPage::IsFilterLink(string plink)

{

if(plink.empty())//空的URL肯定是要过滤的

return true;

if(plink.size()>URL_LEN)//URL的长度超过了我们定义的长度256肯定是要过滤的

return true;

string link=plink,tmp;

CStrFun::Str2Lower(link,link.length());//link字符串中的字母全部变成小写

string::size_type idx=0;

//URL中出现2个'?'字符要过滤

tmp=link;

idx=tmp.find("?");

if(idx!=string::npos)//第一次出现'?'字符

{

tmp=tmp.substr(idx+1);

idx=tmp.find("?");

if(idx!=string::npos)//第二次出现'?'字符

return true;

}

//先后出现'-'和'+'字符要过滤

tmp=link;

idx=tmp.find("-");

if(idx!=string::npos)

{

tmp=tmp.substr(idx+1);

idx=tmp.find("+");

if(idx!=string::npos)

return true;

}

//出现2个'&'字符要过滤

tmp=link;

idx=tmp.find("&");

if(idx!=string::npos)

{

tmp=tmp.substr(idx+1);

idx=tmp.find("&");

if(idx!=string::npos)

return true;

}

//出现2个"//"字符要过滤

tmp=link;

idx=tmp.find("//");

if(idx!=string::npos)

{

tmp=tmp.substr(idx+1);

idx=tmp.find("//");

if(idx!=string::npos)

return true;

}

//出现2个"http"要过滤

tmp=link;

idx=tmp.find("http");

if(idx!=string::npos)

{

tmp=tmp.substr(idx+1);

idx=tmp.find("http");

if(idx!=string::npos)

return true;

}

//出现2个"misc"要过滤

tmp=link;

idx=tmp.find("misc");

if(idx!=string::npos)

{

tmp=tmp.substr(idx+1);

idx=tmp.find("misc");

if(idx!=string::npos)

return true;

}

//出现2个"ipb"要过滤

tmp=link;

idx=tmp.find("ipb");

if(idx!=string::npos)

{

tmp=tmp.substr(idx+1);

idx=tmp.find("ipb");

if(idx!=string::npos)

return true;

}

const char *filter_str[]=

{

"cgi-bin","htbin","linder","srs5","uin-cgi",

"uhtbin","snapshot","=+","=-","script",

"gate","search","clickfile","data/scop","names",

"staff/","enter","user","mail","pst?",

"find?","ccc?","fwd?","tcon?","&amp",

"counter?","forum","cgisirsi","{","}",

"proxy","login","00.pl?","sciserv.pl","sign.asp",

"<",">","review.asp?","result.asp?","keyword",

"\"","'","php?s=","error","showdate",

"niceprot.pl?","volue.asp?id",".css",".asp?month","prot.pl?",

"msg.asp","register.asp", "database","reg.asp","qry?u",

"p?msg","tj_all.asp?page", ".plot.","comment.php","nicezyme.pl?",

"entr","compute-map?", "view-pdb?","list.cgi?","lists.cgi?",

"details.pl?","aligner?","raw.pl?","interface.pl?","memcp.php?",

"member.php?","post.php?","thread.php","bbs/","/bbs"

};

int filter_str_num = 75;

for(int i=0;i<filter_str_num;i++)

if(link.find(filter_str[i])!=string::npos)//说明找到了上述字符串要过滤

return true;

return false;

}

web信息的搜集---Page.h+Page.cpp<1>

web信息的搜集---Page.h+Page.cpp<1>