最后面是源代码,前面是简单的原理介绍
曾经在网上找了很多关于PageRank的论文,基本上都是介绍原理的,没有找到计算过程的具体实现的源代码,前一阵公司有需要于是写了一个,比 较简单(没有反作弊,Blog链接的处理,黑洞的处理都没有管),就是用极限编程的思想用最快的速度实现了一个个人感觉计算效率还不错,(没分块,其实 前期分块后对后续的计算过程也是一样的了P4 3.0,512M),1700万网页迭代一次需要25秒钟的样子.
SortMap.dat是一个链接关系文件,已经按照DocID(每个网页的唯一编号,64位整型)进行排序了.基本上对这样的大文件排序都是 进行分割,排序,归并实现的(大家有兴趣的话我下次将排序源代码也贴上来).如果您手头没有这样的链接数据话http:// www.sogou.com/labs/dl/t-link.html(互联网语料链接关系库),这个里面可以提取,然后就是排序,然后就是下面的源代码计算,你就能看到传说中的PageRank了.Good Luck!
PageRank计算实现原理 PageRank基本原理 可能有点帮助吧,中文的PageRank学习笔记
后面是计算PageRank的源代码,希望大家喜欢:(这里贴源代码确实太难看了 )
#include <windows.h> #include <winbase.h> #include <stdio.h>
#include <iostream> #include <fstream> #include <vector> #include <hash_map> #include <hash_set> #include <assert.h> #include "../FileSort/Sort.h" //一个实现快速排序的文件,后面需要排序的函数都在这个文件中
#define BLOCKSIZE (64*1024*1024) //#define GROUP 8 #define LENGTHOFSINGLE 8 #define OUTDEGREESIZE 2048 #pragma pack(1)
using namespace std;
//实现二分查找功能 template <typename type_key> size_t binaryFind(type_key *begin,type_key *end,const type_key &value) { if(begin>=end) return end-begin; //unsigned __int64 size = sizeof(type_key); //long len = (end-begin); int total = (end-begin); int low=0; int high=total-1; int mid=0; unsigned __int64 temp = 0; while(low<=high) { mid = (low+high)/2; temp = *(begin+mid); if(temp==value) { return mid; } if(temp>value) { high=mid-1; continue; } else { low=mid+1; continue; } } return total;
}
//完成将DocID转换成Index的过程,以后每次迭代不用再进行查找,算是中间的辅助函数 unsigned int BuildIndex(FILE * mapFile) { cout<<"start to build index"<<endl; FILE *countFile = fopen("count.dat","rb"); if (countFile==NULL) { cout<<"count.dat can not open,please check the file"<<endl; return 0; }
unsigned int total; unsigned int MaxOutDegree; fread(&total,4,1,countFile); fread(&MaxOutDegree,4,1,countFile); fclose(countFile); cout<<"total is :/t"<<total<<"/tand the MaxOutDegree:"<<MaxOutDegree<<endl; if (total>0) { unsigned __int64 *index = new unsigned __int64[total]; char *sons = new char[MaxOutDegree*(LENGTHOFSINGLE+1)]; FILE *indexMap = fopen("indexMap.dat","wb"); FILE *DocIDIndex = fopen("DocID.index","wb"); unsigned __int64 DocID;
int outDegree; for(size_t i=0;i<total;++i) { fread(index+i,LENGTHOFSINGLE,1,mapFile); fread(&outDegree,4,1,mapFile); fread(sons,LENGTHOFSINGLE+1,outDegree,mapFile); } cout<<"end of read file"<<endl; fwrite(index,LENGTHOFSINGLE,total,DocIDIndex);
rewind(mapFile); size_t position; for(size_t i=0;i<total;++i) { fread(&DocID,LENGTHOFSINGLE,1,mapFile); fread(&outDegree,4,1,mapFile); fread(sons,LENGTHOFSINGLE+1,outDegree,mapFile);
fwrite(&i,4,1,indexMap); fwrite(&outDegree,4,1,indexMap); for (int j=0;j<outDegree;++j) { //fread(sons,LENGTHOFSINGLE+1,1,mapFile); DocID = *(unsigned __int64 *)(sons+j*(LENGTHOFSINGLE+1)); position = binaryFind(index,index+total,DocID); if (position != total) {//找到 fwrite(&position,4,1,indexMap); } else fwrite(&total,4,1,indexMap); } }
fcloseall(); delete []index; delete []sons; } cout<<"over build the index"<<endl; return total;
}
int main(int argc, char *argv[]) { ofstream fout("log.txt");
FILE *file = NULL; if(argc>=2) file = fopen(argv[1],"rb"); else file = fopen("F://ren//SortMap.dat","rb"); if(file==NULL) cout<<"--------can not open the file--------"<<endl;
unsigned int start,end; start = GetTickCount(); size_t total=BuildIndex(file); end = GetTickCount(); cout<<"Build index lost:/t"<<end-start<<endl;
if (total<1) { cout<<"BuildIndex Error and the program exit"<<endl; return 0; }
size_t DocID=0; int outDegree = 0; fclose(file); file = fopen("indexMap.dat","rb"); FILE *DocIDIndex = fopen("DocID.Index","rb"); cout<<"start the total is:"<<total<<endl; if(total>0) { //真正实现的计算过程 unsigned int * sons = new unsigned int[1]; start = GetTickCount(); float *pageRank = new float[total]; float *prTmp = new float[total]; for (size_t i=0;i<total;++i) { pageRank[i] = 1.0f; prTmp[i] = 0.0f; } end = GetTickCount(); cout<<"end of index:"<<end-start<<endl;
ofstream fout("pagerank.txt"); float fatherRank = 1; const float alpha = 0.85f; size_t position = total; size_t index=0; for (size_t iter=0;iter<30;++iter) { rewind(file); start = GetTickCount(); for (size_t i=0;i<total;++i) { fread(&DocID,4,1,file); fread(&outDegree,4,1,file); if (outDegree==0) { continue; } fatherRank = pageRank[i]/outDegree; for (int k=0;k<outDegree;++k) { fread(sons,4,1,file); if (total > *sons && *sons>=0) { prTmp[ *sons ] += fatherRank; } } } for (int i=0;i<total;++i) { prTmp[i] = 0.15f + alpha*prTmp[i]; pageRank[i] = prTmp[i]; prTmp[i] = 0.0f; } end = GetTickCount(); cout<<">>>>>>>>>>>>>>>"<<(iter+1)<<"/t"<<end-start<<endl; } delete []prTmp; delete []sons; cout<<"over is writing"<<endl; FILE * PageRankList = fopen("PageRankList.dat","wb"); fwrite(pageRank,4,total,PageRankList); delete []pageRank; } fout.close(); fclose(file); fclose(DocIDIndex); fcloseall(); cout<<"-------over----------"<<endl; return 1;
}
|